Revert "[X86] Fold (add X, (srl Y, 7)) -> (sub X, (ashr Y, 7)) on vXi8 vectors" #143303
Merged
Conversation
Revert "[X86] Fold (add X, (srl Y, 7)) -> (sub X, (ashr Y, 7)) on vXi8 vectors"

This reverts commit 4e676a1.
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes: Reverts llvm/llvm-project#143106 as this is causing infinite loops in #143238

Patch is 89.44 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/143303.diff

4 Files Affected:
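To make the reverted transform concrete: on i8 lanes, (srl Y, 7) extracts the sign bit (0 or 1) while (ashr Y, 7) broadcasts it (0 or -1), so adding the former equals subtracting the latter modulo 2^8. A minimal standalone check of the identity, not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

// Exhaustively verify, for all 8-bit values, that
//   add(x, lshr(y, 7)) == sub(x, ashr(y, 7))
// lshr(y, 7) is 0 or 1 (the sign bit); ashr(y, 7) is 0 or 0xFF (-1),
// and x + 1 == x - (-1) under 8-bit wraparound.
int main() {
  for (int xi = -128; xi < 128; ++xi) {
    for (int yi = -128; yi < 128; ++yi) {
      uint8_t x = (uint8_t)xi, y = (uint8_t)yi;
      uint8_t lshr = y >> 7;                    // logical shift: 0 or 1
      uint8_t ashr = (uint8_t)((int8_t)y >> 7); // arithmetic shift: 0 or 0xFF
      assert((uint8_t)(x + lshr) == (uint8_t)(x - ashr));
    }
  }
  return 0;
}
```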
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 34e3f52bf7ff9..a9dbb29258578 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58110,14 +58110,13 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
}
}
+ // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
+ // (sub Y, (sext (vXi1 X))).
+ // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
+ // generic DAG combine without a legal type check, but adding this there
+ // caused regressions.
if (VT.isVector()) {
SDValue X, Y;
-
- // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
- // (sub Y, (sext (vXi1 X))).
- // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
- // in generic DAG combine without a legal type check, but adding this there
- // caused regressions.
EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
VT.getVectorElementCount());
if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
@@ -58126,15 +58125,6 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
}
-
- // Fold (add X, (srl Y, 7)) -> (sub X, (ashr Y, 7)) to undo instcombine
- // canonicalisation as we don't have good vXi8 shifts.
- if (VT.getScalarType() == MVT::i8 &&
- sd_match(N, m_Add(m_Value(X), m_Srl(m_Value(Y), m_SpecificInt(7))))) {
- SDValue AShr = DAG.getNode(ISD::SRA, DL, VT, Y,
- DAG.getShiftAmountConstant(7, VT, DL));
- return DAG.getNode(ISD::SUB, DL, VT, X, AShr);
- }
}
// Peephole for 512-bit VPDPBSSD on non-VLX targets.
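The comment block retained above describes a related rewrite, (add (zext (vXi1 X)), Y) -> (sub Y, (sext (vXi1 X))), which rests on zext of i1 being 0/1 while sext of i1 is 0/-1. A sketch of that identity at i8 width (illustrative only, not LLVM code):

```cpp
#include <cassert>
#include <cstdint>

// For a boolean b: zext(b) is 0 or 1, sext(b) is 0 or -1 (0xFF at i8),
// so y + zext(b) == y - sext(b) in wraparound arithmetic.
int main() {
  for (int b = 0; b <= 1; ++b) {
    for (int yi = -128; yi < 128; ++yi) {
      uint8_t y = (uint8_t)yi;
      uint8_t zext = (uint8_t)b;          // 0 or 1
      uint8_t sext = (uint8_t)-(int8_t)b; // 0 or 0xFF
      assert((uint8_t)(y + zext) == (uint8_t)(y - sext));
    }
  }
  return 0;
}
```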
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 425cda1acb1fe..3cc17c1f2b86a 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -174,18 +174,19 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE-NEXT: pmulhw %xmm3, %xmm4
-; SSE-NEXT: psrlw $8, %xmm4
-; SSE-NEXT: packuswb %xmm2, %xmm4
-; SSE-NEXT: paddb %xmm4, %xmm0
-; SSE-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE-NEXT: psrlw $2, %xmm0
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: pmulhw %xmm3, %xmm1
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: packuswb %xmm2, %xmm1
+; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlw $2, %xmm1
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm1
+; SSE-NEXT: psrlw $7, %xmm0
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: psubb %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -196,17 +197,18 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -218,13 +220,13 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2NOBW-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
@@ -236,13 +238,13 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
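All of these sequences share the pmulhw-by-37632 step: 37632 is 0x9300, i.e. the signed magic byte 0x93 (-109 == ceil(2^10 / 7) - 2^8) placed in the high half of each word, which implements signed division by 7 without a divide instruction. A standalone check of the underlying arithmetic, assuming the standard round-up magic-number scheme; the +(x < 0) term is the sign correction that the pcmpgtb and psrlw $7 variants above compute in different ways:

```cpp
#include <cassert>
#include <cstdint>

// Signed int8 division by 7 via multiply-high, as in the asm above:
//   q = ((x + ((-109 * x) >> 8)) >> 2) + (x < 0)
// where (-109 * x) >> 8 is the high byte of the 16-bit product.
int main() {
  for (int x = -128; x < 128; ++x) {
    int mulhi = (-109 * x) >> 8;          // arithmetic shift on mainstream targets
    int q = ((x + mulhi) >> 2) + (x < 0); // shift, then sign correction
    assert(q == x / 7);                   // C division truncates toward zero
  }
  return 0;
}
```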
@@ -262,25 +264,26 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
-; SSE-NEXT: psrlw $8, %xmm3
-; SSE-NEXT: packuswb %xmm2, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: packuswb %xmm2, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: paddb %xmm3, %xmm0
+; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE-NEXT: psraw $8, %xmm1
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,64,128,32,64,128,128,64]
+; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: psraw $8, %xmm2
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,64,128,32,64,128,128,64]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,128,64,32,128,64,32]
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,64,128,64,32,128,64,32]
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: psubb %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm2
+; SSE-NEXT: psrlw $7, %xmm0
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_divconstant_16i8:
@@ -289,23 +292,24 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,64,128,32,64,128,128,64]
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32,64,128,32,64,128,128,64]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,64,128,64,32,128,64,32]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsraw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [64,64,128,64,32,128,64,32]
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_divconstant_16i8:
@@ -322,9 +326,9 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX2NOBW-NEXT: vpsubb %xmm0, %xmm1, %xmm0
+; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
@@ -340,9 +344,9 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT: vpsravw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX512BW-NEXT: vpsubb %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%res = sdiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
@@ -564,24 +568,25 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE-NEXT: pmulhw %xmm3, %xmm4
-; SSE-NEXT: psrlw $8, %xmm4
-; SSE-NEXT: packuswb %xmm2, %xmm4
-; SSE-NEXT: paddb %xmm0, %xmm4
-; SSE-NEXT: pcmpgtb %xmm4, %xmm1
-; SSE-NEXT: psrlw $2, %xmm4
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; SSE-NEXT: pxor %xmm2, %xmm4
-; SSE-NEXT: psubb %xmm1, %xmm4
-; SSE-NEXT: psubb %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: psllw $3, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: pmulhw %xmm3, %xmm1
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: packuswb %xmm2, %xmm1
+; SSE-NEXT: paddb %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psrlw $2, %xmm2
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE-NEXT: pxor %xmm3, %xmm2
+; SSE-NEXT: psrlw $7, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: psubb %xmm1, %xmm4
-; SSE-NEXT: paddb %xmm4, %xmm0
+; SSE-NEXT: paddb %xmm2, %xmm1
+; SSE-NEXT: psubb %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: psllw $3, %xmm2
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT: psubb %xmm2, %xmm1
+; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
@@ -591,17 +596,18 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
@@ -617,13 +623,13 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2NOBW-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
@@ -639,13 +645,13 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
@@ -669,26 +675,27 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[...
[truncated]
omkar-mohanty pushed a commit to omkar-mohanty/llvm-project that referenced this pull request on Jun 8, 2025:
Revert "[X86] Fold (add X, (srl Y, 7)) -> (sub X, (ashr Y, 7)) on vXi8 vectors" (llvm#143303). Reverts llvm#143106 as this is causing infinite loops in llvm#143238
Reverts #143106 as this is causing infinite loops in #143238
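The infinite loop lives in the DAG combiner (see #143238 for the actual reproducer); the generic hazard, sketched here as a toy outside LLVM with hypothetical rule names, is a pair of rewrites where each undoes the other, so a fixed-point-driven worklist never terminates:

```cpp
#include <cstdio>

// Toy illustration (not LLVM code): ruleA rewrites add(x, lshr(y,7))
// into sub(x, ashr(y,7)), and ruleB rewrites that sub back into the
// original add. Each rule re-queues the node it produced, so a real
// worklist-driven combiner would ping-pong between the two forms forever.
enum class Form { AddLshr, SubAshr };

Form ruleA(Form f) { return f == Form::AddLshr ? Form::SubAshr : f; }
Form ruleB(Form f) { return f == Form::SubAshr ? Form::AddLshr : f; }

int main() {
  Form f = Form::AddLshr;
  for (int step = 0; step < 8; ++step) { // capped here; a combiner would not stop
    f = ruleA(f);
    f = ruleB(f);
    printf("step %d: back to %s\n", step,
           f == Form::AddLshr ? "add(x, lshr(y,7))" : "sub(x, ashr(y,7))");
  }
  return 0;
}
```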