Skip to content

Commit 755e008

Browse files
committed
[X86] Remove isel patterns for X86VBroadcast+trunc+extload. Replace with DAG combines.
This is a little more complicated than I'd like it to be. We have to manually match a trunc+srl+load pattern that generic DAG combine won't do for us due to isTypeDesirableForOp.
1 parent 7c50454 commit 755e008

File tree

6 files changed

+79
-135
lines changed

6 files changed

+79
-135
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 67 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35231,22 +35231,74 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
3523135231
// Due to isTypeDesirableForOp, we won't always shrink a load truncated to
3523235232
// i16. So shrink it ourselves if we can make a broadcast_load.
3523335233
if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
35234-
Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) &&
35235-
Src.getOperand(0).hasOneUse()) {
35234+
Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
3523635235
assert(Subtarget.hasAVX2() && "Expected AVX2");
35237-
LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
35238-
if (LN->isSimple()) {
35239-
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35240-
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35241-
SDValue BcastLd =
35242-
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
35243-
MVT::i16, LN->getPointerInfo(),
35244-
LN->getAlignment(),
35245-
LN->getMemOperand()->getFlags());
35246-
DCI.CombineTo(N.getNode(), BcastLd);
35247-
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
35248-
DCI.recursivelyDeleteUnusedNodes(LN);
35249-
return N; // Return N so it doesn't get rechecked!
35236+
SDValue TruncIn = Src.getOperand(0);
35237+
35238+
// If this is a truncate of a non-extending load we can just narrow it to
35239+
// use a broadcast_load.
35240+
if (ISD::isNormalLoad(TruncIn.getNode())) {
35241+
LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
35242+
// Unless it's volatile or atomic.
35243+
if (LN->isSimple()) {
35244+
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35245+
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35246+
SDValue BcastLd =
35247+
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
35248+
MVT::i16, LN->getPointerInfo(),
35249+
LN->getAlignment(),
35250+
LN->getMemOperand()->getFlags());
35251+
DCI.CombineTo(N.getNode(), BcastLd);
35252+
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
35253+
DCI.recursivelyDeleteUnusedNodes(LN);
35254+
return N; // Return N so it doesn't get rechecked!
35255+
}
35256+
}
35257+
35258+
// If this is a truncate of an i16 extload, we can directly replace it.
35259+
if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
35260+
ISD::isEXTLoad(Src.getOperand(0).getNode())) {
35261+
LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
35262+
if (LN->getMemoryVT().getSizeInBits() == 16) {
35263+
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35264+
SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35265+
SDValue BcastLd =
35266+
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
35267+
LN->getMemoryVT(), LN->getMemOperand());
35268+
DCI.CombineTo(N.getNode(), BcastLd);
35269+
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
35270+
DCI.recursivelyDeleteUnusedNodes(LN);
35271+
return N; // Return N so it doesn't get rechecked!
35272+
}
35273+
}
35274+
35275+
// If this is a truncate of a load that has been shifted right, we can
35276+
// offset the pointer and use a narrower load.
35277+
if (TruncIn.getOpcode() == ISD::SRL &&
35278+
TruncIn.getOperand(0).hasOneUse() &&
35279+
isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
35280+
ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
35281+
LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
35282+
unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
35283+
// Make sure the shift amount and the load size are divisible by 16.
35284+
// Don't do this if the load is volatile or atomic.
35285+
if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
35286+
LN->isSimple()) {
35287+
unsigned Offset = ShiftAmt / 8;
35288+
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35289+
SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL);
35290+
SDValue Ops[] = { LN->getChain(), Ptr };
35291+
SDValue BcastLd =
35292+
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
35293+
MVT::i16,
35294+
LN->getPointerInfo().getWithOffset(Offset),
35295+
MinAlign(LN->getAlignment(), Offset),
35296+
LN->getMemOperand()->getFlags());
35297+
DCI.CombineTo(N.getNode(), BcastLd);
35298+
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
35299+
DCI.recursivelyDeleteUnusedNodes(LN);
35300+
return N; // Return N so it doesn't get rechecked!
35301+
}
3525035302
}
3525135303
}
3525235304

llvm/lib/Target/X86/X86InstrAVX512.td

Lines changed: 0 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1423,53 +1423,6 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
14231423
AVX5128IBase, EVEX;
14241424
}
14251425

1426-
let Predicates = [HasVLX, HasBWI] in {
1427-
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
1428-
// This means we'll encounter truncated i32 loads; match that here.
1429-
def : Pat<(v8i16 (X86VBroadcast
1430-
(i16 (trunc (extloadi32i16 addr:$src))))),
1431-
(VPBROADCASTWZ128rm addr:$src)>;
1432-
def : Pat<(v8i16 (X86VBroadcast
1433-
(i16 (trunc (zextloadi32i16 addr:$src))))),
1434-
(VPBROADCASTWZ128rm addr:$src)>;
1435-
def : Pat<(v16i16 (X86VBroadcast
1436-
(i16 (trunc (extloadi32i16 addr:$src))))),
1437-
(VPBROADCASTWZ256rm addr:$src)>;
1438-
def : Pat<(v16i16 (X86VBroadcast
1439-
(i16 (trunc (zextloadi32i16 addr:$src))))),
1440-
(VPBROADCASTWZ256rm addr:$src)>;
1441-
1442-
def : Pat<(v8i16 (X86VBroadcast
1443-
(i16 (trunc (extloadi64i16 addr:$src))))),
1444-
(VPBROADCASTWZ128rm addr:$src)>;
1445-
def : Pat<(v8i16 (X86VBroadcast
1446-
(i16 (trunc (zextloadi64i16 addr:$src))))),
1447-
(VPBROADCASTWZ128rm addr:$src)>;
1448-
def : Pat<(v16i16 (X86VBroadcast
1449-
(i16 (trunc (extloadi64i16 addr:$src))))),
1450-
(VPBROADCASTWZ256rm addr:$src)>;
1451-
def : Pat<(v16i16 (X86VBroadcast
1452-
(i16 (trunc (zextloadi64i16 addr:$src))))),
1453-
(VPBROADCASTWZ256rm addr:$src)>;
1454-
}
1455-
let Predicates = [HasBWI] in {
1456-
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
1457-
// This means we'll encounter truncated i32 loads; match that here.
1458-
def : Pat<(v32i16 (X86VBroadcast
1459-
(i16 (trunc (extloadi32i16 addr:$src))))),
1460-
(VPBROADCASTWZrm addr:$src)>;
1461-
def : Pat<(v32i16 (X86VBroadcast
1462-
(i16 (trunc (zextloadi32i16 addr:$src))))),
1463-
(VPBROADCASTWZrm addr:$src)>;
1464-
1465-
def : Pat<(v32i16 (X86VBroadcast
1466-
(i16 (trunc (extloadi64i16 addr:$src))))),
1467-
(VPBROADCASTWZrm addr:$src)>;
1468-
def : Pat<(v32i16 (X86VBroadcast
1469-
(i16 (trunc (zextloadi64i16 addr:$src))))),
1470-
(VPBROADCASTWZrm addr:$src)>;
1471-
}
1472-
14731426
//===----------------------------------------------------------------------===//
14741427
// AVX-512 BROADCAST SUBVECTORS
14751428
//

llvm/lib/Target/X86/X86InstrSSE.td

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7514,36 +7514,6 @@ defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastl
75147514
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
75157515
v2i64, v4i64, NoVLX>;
75167516

7517-
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7518-
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
7519-
// This means we'll encounter truncated i32 loads; match that here.
7520-
def : Pat<(v8i16 (X86VBroadcast
7521-
(i16 (trunc (extloadi32i16 addr:$src))))),
7522-
(VPBROADCASTWrm addr:$src)>;
7523-
def : Pat<(v8i16 (X86VBroadcast
7524-
(i16 (trunc (zextloadi32i16 addr:$src))))),
7525-
(VPBROADCASTWrm addr:$src)>;
7526-
def : Pat<(v16i16 (X86VBroadcast
7527-
(i16 (trunc (extloadi32i16 addr:$src))))),
7528-
(VPBROADCASTWYrm addr:$src)>;
7529-
def : Pat<(v16i16 (X86VBroadcast
7530-
(i16 (trunc (zextloadi32i16 addr:$src))))),
7531-
(VPBROADCASTWYrm addr:$src)>;
7532-
7533-
def : Pat<(v8i16 (X86VBroadcast
7534-
(i16 (trunc (extloadi64i16 addr:$src))))),
7535-
(VPBROADCASTWrm addr:$src)>;
7536-
def : Pat<(v8i16 (X86VBroadcast
7537-
(i16 (trunc (zextloadi64i16 addr:$src))))),
7538-
(VPBROADCASTWrm addr:$src)>;
7539-
def : Pat<(v16i16 (X86VBroadcast
7540-
(i16 (trunc (extloadi64i16 addr:$src))))),
7541-
(VPBROADCASTWYrm addr:$src)>;
7542-
def : Pat<(v16i16 (X86VBroadcast
7543-
(i16 (trunc (zextloadi64i16 addr:$src))))),
7544-
(VPBROADCASTWYrm addr:$src)>;
7545-
}
7546-
75477517
let Predicates = [HasAVX2, NoVLX] in {
75487518
// Provide fallback in case the load node that is used in the patterns above
75497519
// is used by additional users, which prevents the pattern selection.

llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3280,20 +3280,10 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i64(i64* %ptr) {
32803280
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
32813281
; AVX1-NEXT: retq
32823282
;
3283-
; AVX2-LABEL: insert_dup_elt1_mem_v8i16_i64:
3284-
; AVX2: # %bb.0:
3285-
; AVX2-NEXT: movq (%rdi), %rax
3286-
; AVX2-NEXT: shrq $16, %rax
3287-
; AVX2-NEXT: vmovd %eax, %xmm0
3288-
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3289-
; AVX2-NEXT: retq
3290-
;
3291-
; AVX512VL-LABEL: insert_dup_elt1_mem_v8i16_i64:
3292-
; AVX512VL: # %bb.0:
3293-
; AVX512VL-NEXT: movq (%rdi), %rax
3294-
; AVX512VL-NEXT: shrq $16, %rax
3295-
; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
3296-
; AVX512VL-NEXT: retq
3283+
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i64:
3284+
; AVX2OR512VL: # %bb.0:
3285+
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0
3286+
; AVX2OR512VL-NEXT: retq
32973287
;
32983288
; XOPAVX1-LABEL: insert_dup_elt1_mem_v8i16_i64:
32993289
; XOPAVX1: # %bb.0:
@@ -3304,10 +3294,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i64(i64* %ptr) {
33043294
;
33053295
; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i64:
33063296
; XOPAVX2: # %bb.0:
3307-
; XOPAVX2-NEXT: movq (%rdi), %rax
3308-
; XOPAVX2-NEXT: shrq $16, %rax
3309-
; XOPAVX2-NEXT: vmovd %eax, %xmm0
3310-
; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0
3297+
; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %xmm0
33113298
; XOPAVX2-NEXT: retq
33123299
%tmp = load i64, i64* %ptr, align 4
33133300
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0

llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7500,20 +7500,10 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
75007500
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
75017501
; AVX1-NEXT: retq
75027502
;
7503-
; AVX2-LABEL: insert_dup_elt1_mem_v16i16_i64:
7504-
; AVX2: # %bb.0:
7505-
; AVX2-NEXT: movq (%rdi), %rax
7506-
; AVX2-NEXT: shrq $16, %rax
7507-
; AVX2-NEXT: vmovd %eax, %xmm0
7508-
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
7509-
; AVX2-NEXT: retq
7510-
;
7511-
; AVX512VL-LABEL: insert_dup_elt1_mem_v16i16_i64:
7512-
; AVX512VL: # %bb.0:
7513-
; AVX512VL-NEXT: movq (%rdi), %rax
7514-
; AVX512VL-NEXT: shrq $16, %rax
7515-
; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
7516-
; AVX512VL-NEXT: retq
7503+
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i16_i64:
7504+
; AVX2OR512VL: # %bb.0:
7505+
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0
7506+
; AVX2OR512VL-NEXT: retq
75177507
;
75187508
; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i16_i64:
75197509
; XOPAVX1: # %bb.0:
@@ -7525,10 +7515,7 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
75257515
;
75267516
; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i16_i64:
75277517
; XOPAVX2: # %bb.0:
7528-
; XOPAVX2-NEXT: movq (%rdi), %rax
7529-
; XOPAVX2-NEXT: shrq $16, %rax
7530-
; XOPAVX2-NEXT: vmovd %eax, %xmm0
7531-
; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0
7518+
; XOPAVX2-NEXT: vpbroadcastw 2(%rdi), %ymm0
75327519
; XOPAVX2-NEXT: retq
75337520
%tmp = load i64, i64* %ptr, align 4
75347521
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0

llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -310,18 +310,13 @@ define <32 x i16> @insert_dup_mem_v16i16_i64(i64* %ptr) {
310310
define <32 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
311311
; KNL-LABEL: insert_dup_elt1_mem_v16i16_i64:
312312
; KNL: ## %bb.0:
313-
; KNL-NEXT: movq (%rdi), %rax
314-
; KNL-NEXT: shrq $16, %rax
315-
; KNL-NEXT: vmovd %eax, %xmm0
316-
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
313+
; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0
317314
; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
318315
; KNL-NEXT: retq
319316
;
320317
; SKX-LABEL: insert_dup_elt1_mem_v16i16_i64:
321318
; SKX: ## %bb.0:
322-
; SKX-NEXT: movq (%rdi), %rax
323-
; SKX-NEXT: shrq $16, %rax
324-
; SKX-NEXT: vpbroadcastw %eax, %zmm0
319+
; SKX-NEXT: vpbroadcastw 2(%rdi), %zmm0
325320
; SKX-NEXT: retq
326321
%tmp = load i64, i64* %ptr, align 4
327322
%tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0

0 commit comments

Comments
 (0)