Skip to content

Commit af647e2

Browse files
jayfoad authored and David Salinas committed
[AMDGPU] Move architected SGPR implementation into isel (llvm#79120)
Change-Id: Ifc34a16bf7fce67902ce85fbfe2622c82c35213d
1 parent 40c42c6 commit af647e2

8 files changed

+696
-222
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

+37-2
Original file line number | Diff line number | Diff line change
@@ -4178,10 +4178,45 @@ bool AMDGPULegalizerInfo::loadInputValue(
41784178
Register DstReg, MachineIRBuilder &B,
41794179
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
41804180
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4181-
const ArgDescriptor *Arg;
4181+
const ArgDescriptor *Arg = nullptr;
41824182
const TargetRegisterClass *ArgRC;
41834183
LLT ArgTy;
4184-
std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4184+
4185+
CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4186+
const ArgDescriptor WorkGroupIDX =
4187+
ArgDescriptor::createRegister(AMDGPU::TTMP9);
4188+
// If GridZ is not programmed in an entry function then the hardware will set
4189+
// it to all zeros, so there is no need to mask the GridY value in the low
4190+
// order bits.
4191+
const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4192+
AMDGPU::TTMP7,
4193+
AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4194+
const ArgDescriptor WorkGroupIDZ =
4195+
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4196+
if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
4197+
switch (ArgType) {
4198+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4199+
Arg = &WorkGroupIDX;
4200+
ArgRC = &AMDGPU::SReg_32RegClass;
4201+
ArgTy = LLT::scalar(32);
4202+
break;
4203+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4204+
Arg = &WorkGroupIDY;
4205+
ArgRC = &AMDGPU::SReg_32RegClass;
4206+
ArgTy = LLT::scalar(32);
4207+
break;
4208+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4209+
Arg = &WorkGroupIDZ;
4210+
ArgRC = &AMDGPU::SReg_32RegClass;
4211+
ArgTy = LLT::scalar(32);
4212+
break;
4213+
default:
4214+
break;
4215+
}
4216+
}
4217+
4218+
if (!Arg)
4219+
std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
41854220

41864221
if (!Arg) {
41874222
if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+49-19
Original file line number | Diff line number | Diff line change
@@ -2063,11 +2063,45 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
20632063
const SIMachineFunctionInfo &MFI,
20642064
EVT VT,
20652065
AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2066-
const ArgDescriptor *Reg;
2066+
const ArgDescriptor *Reg = nullptr;
20672067
const TargetRegisterClass *RC;
20682068
LLT Ty;
20692069

2070-
std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2070+
CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2071+
const ArgDescriptor WorkGroupIDX =
2072+
ArgDescriptor::createRegister(AMDGPU::TTMP9);
2073+
// If GridZ is not programmed in an entry function then the hardware will set
2074+
// it to all zeros, so there is no need to mask the GridY value in the low
2075+
// order bits.
2076+
const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2077+
AMDGPU::TTMP7,
2078+
AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2079+
const ArgDescriptor WorkGroupIDZ =
2080+
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2081+
if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
2082+
switch (PVID) {
2083+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2084+
Reg = &WorkGroupIDX;
2085+
RC = &AMDGPU::SReg_32RegClass;
2086+
Ty = LLT::scalar(32);
2087+
break;
2088+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2089+
Reg = &WorkGroupIDY;
2090+
RC = &AMDGPU::SReg_32RegClass;
2091+
Ty = LLT::scalar(32);
2092+
break;
2093+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2094+
Reg = &WorkGroupIDZ;
2095+
RC = &AMDGPU::SReg_32RegClass;
2096+
Ty = LLT::scalar(32);
2097+
break;
2098+
default:
2099+
break;
2100+
}
2101+
}
2102+
2103+
if (!Reg)
2104+
std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
20712105
if (!Reg) {
20722106
if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
20732107
// It's possible for a kernarg intrinsic call to appear in a kernel with
@@ -2496,28 +2530,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
24962530
}
24972531
}
24982532

2499-
if (Info.hasWorkGroupIDX()) {
2500-
Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
2501-
if (!HasArchitectedSGPRs)
2533+
if (!HasArchitectedSGPRs) {
2534+
if (Info.hasWorkGroupIDX()) {
2535+
Register Reg = Info.addWorkGroupIDX();
25022536
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2537+
CCInfo.AllocateReg(Reg);
2538+
}
25032539

2504-
CCInfo.AllocateReg(Reg);
2505-
}
2506-
2507-
if (Info.hasWorkGroupIDY()) {
2508-
Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
2509-
if (!HasArchitectedSGPRs)
2540+
if (Info.hasWorkGroupIDY()) {
2541+
Register Reg = Info.addWorkGroupIDY();
25102542
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2543+
CCInfo.AllocateReg(Reg);
2544+
}
25112545

2512-
CCInfo.AllocateReg(Reg);
2513-
}
2514-
2515-
if (Info.hasWorkGroupIDZ()) {
2516-
Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
2517-
if (!HasArchitectedSGPRs)
2546+
if (Info.hasWorkGroupIDZ()) {
2547+
Register Reg = Info.addWorkGroupIDZ();
25182548
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2519-
2520-
CCInfo.AllocateReg(Reg);
2549+
CCInfo.AllocateReg(Reg);
2550+
}
25212551
}
25222552

25232553
if (Info.hasWorkGroupInfo()) {

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

+9-23
Original file line number | Diff line number | Diff line change
@@ -753,35 +753,21 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
753753
}
754754

755755
// Add system SGPRs.
756-
Register addWorkGroupIDX(bool HasArchitectedSGPRs) {
757-
Register Reg =
758-
HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR();
759-
ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg);
760-
if (!HasArchitectedSGPRs)
761-
NumSystemSGPRs += 1;
762-
756+
Register addWorkGroupIDX() {
757+
ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
758+
NumSystemSGPRs += 1;
763759
return ArgInfo.WorkGroupIDX.getRegister();
764760
}
765761

766-
Register addWorkGroupIDY(bool HasArchitectedSGPRs) {
767-
Register Reg =
768-
HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
769-
unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u;
770-
ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask);
771-
if (!HasArchitectedSGPRs)
772-
NumSystemSGPRs += 1;
773-
762+
Register addWorkGroupIDY() {
763+
ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
764+
NumSystemSGPRs += 1;
774765
return ArgInfo.WorkGroupIDY.getRegister();
775766
}
776767

777-
Register addWorkGroupIDZ(bool HasArchitectedSGPRs) {
778-
Register Reg =
779-
HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
780-
unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u;
781-
ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask);
782-
if (!HasArchitectedSGPRs)
783-
NumSystemSGPRs += 1;
784-
768+
Register addWorkGroupIDZ() {
769+
ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
770+
NumSystemSGPRs += 1;
785771
return ArgInfo.WorkGroupIDZ.getRegister();
786772
}
787773

llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll

+56-29
Original file line number | Diff line number | Diff line change
@@ -8,35 +8,62 @@
88
; FIXME: Passing real values for workitem ID, and 0s that can be undef
99

1010
define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
11-
; CHECK-LABEL: indirect_call_known_no_special_inputs:
12-
; CHECK: ; %bb.0: ; %bb
13-
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
14-
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
15-
; CHECK-NEXT: s_add_u32 s0, s0, s7
16-
; CHECK-NEXT: s_addc_u32 s1, s1, 0
17-
; CHECK-NEXT: s_mov_b64 s[4:5], 0
18-
; CHECK-NEXT: s_load_dword s7, s[4:5], 0x0
19-
; CHECK-NEXT: s_getpc_b64 s[4:5]
20-
; CHECK-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
21-
; CHECK-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
22-
; CHECK-NEXT: s_getpc_b64 s[8:9]
23-
; CHECK-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4
24-
; CHECK-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
25-
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
26-
; CHECK-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
27-
; CHECK-NEXT: s_mov_b64 s[8:9], 0
28-
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
29-
; CHECK-NEXT: s_and_b32 s4, 1, s7
30-
; CHECK-NEXT: s_cmp_eq_u32 s4, 1
31-
; CHECK-NEXT: v_mov_b32_e32 v31, v0
32-
; CHECK-NEXT: s_cselect_b32 s5, s13, s11
33-
; CHECK-NEXT: s_cselect_b32 s4, s12, s10
34-
; CHECK-NEXT: s_mov_b32 s12, s6
35-
; CHECK-NEXT: v_mov_b32_e32 v1, 0
36-
; CHECK-NEXT: v_mov_b32_e32 v4, 0
37-
; CHECK-NEXT: s_mov_b32 s32, 0
38-
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
39-
; CHECK-NEXT: s_endpgm
11+
; GFX9-LABEL: indirect_call_known_no_special_inputs:
12+
; GFX9: ; %bb.0: ; %bb
13+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
14+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
15+
; GFX9-NEXT: s_add_u32 s0, s0, s7
16+
; GFX9-NEXT: s_addc_u32 s1, s1, 0
17+
; GFX9-NEXT: s_mov_b64 s[4:5], 0
18+
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0
19+
; GFX9-NEXT: s_getpc_b64 s[4:5]
20+
; GFX9-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
21+
; GFX9-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
22+
; GFX9-NEXT: s_getpc_b64 s[8:9]
23+
; GFX9-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4
24+
; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
25+
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
26+
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
27+
; GFX9-NEXT: s_mov_b64 s[8:9], 0
28+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
29+
; GFX9-NEXT: s_and_b32 s4, 1, s7
30+
; GFX9-NEXT: s_cmp_eq_u32 s4, 1
31+
; GFX9-NEXT: v_mov_b32_e32 v31, v0
32+
; GFX9-NEXT: s_cselect_b32 s5, s13, s11
33+
; GFX9-NEXT: s_cselect_b32 s4, s12, s10
34+
; GFX9-NEXT: s_mov_b32 s12, s6
35+
; GFX9-NEXT: v_mov_b32_e32 v1, 0
36+
; GFX9-NEXT: v_mov_b32_e32 v4, 0
37+
; GFX9-NEXT: s_mov_b32 s32, 0
38+
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
39+
; GFX9-NEXT: s_endpgm
40+
;
41+
; GFX12-LABEL: indirect_call_known_no_special_inputs:
42+
; GFX12: ; %bb.0: ; %bb
43+
; GFX12-NEXT: s_getpc_b64 s[2:3]
44+
; GFX12-NEXT: s_sext_i32_i16 s3, s3
45+
; GFX12-NEXT: s_add_co_u32 s2, s2, snork@gotpcrel32@lo+8
46+
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, snork@gotpcrel32@hi+16
47+
; GFX12-NEXT: s_mov_b64 s[0:1], 0
48+
; GFX12-NEXT: s_getpc_b64 s[4:5]
49+
; GFX12-NEXT: s_sext_i32_i16 s5, s5
50+
; GFX12-NEXT: s_add_co_u32 s4, s4, wobble@gotpcrel32@lo+8
51+
; GFX12-NEXT: s_add_co_ci_u32 s5, s5, wobble@gotpcrel32@hi+16
52+
; GFX12-NEXT: s_load_u8 s6, s[0:1], 0x0
53+
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
54+
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
55+
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0
56+
; GFX12-NEXT: v_mov_b32_e32 v31, v0
57+
; GFX12-NEXT: s_mov_b64 s[8:9], 0
58+
; GFX12-NEXT: s_mov_b32 s32, 0
59+
; GFX12-NEXT: s_wait_kmcnt 0x0
60+
; GFX12-NEXT: s_and_b32 s4, 1, s6
61+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
62+
; GFX12-NEXT: s_cmp_eq_u32 s4, 1
63+
; GFX12-NEXT: s_cselect_b32 s1, s3, s1
64+
; GFX12-NEXT: s_cselect_b32 s0, s2, s0
65+
; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1]
66+
; GFX12-NEXT: s_endpgm
4067

4168
bb:
4269
%cond = load i1, ptr addrspace(4) null

0 commit comments

Comments
 (0)