From 7025ac81f05c90e0beee67755d81f5be7c508e52 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Thu, 29 Jun 2023 15:37:09 +0200
Subject: [PATCH] [X86] Don't elide argument copies for scalarized vectors
 (PR63475)

When eliding argument copies, the memory layout of a plain store of
the type must match the layout of the argument lowering on the stack.
For multi-part argument lowerings, this is not necessarily the case.

The code already tried to prevent this optimization for "scalarized
and extended" vectors, but the check for "extends" was incomplete.
While a scalarized vector of i32s stores i32 values on the stack,
these are stored in 8-byte stack slots (on x86_64), so they
effectively have padding between them.

Rather than trying to add more special cases to handle this (which is
not straightforward), I'm going in the other direction and excluding
scalarized vectors from this optimization entirely. This seems like a
rare case that is not worth the hassle -- the complete lack of test
coverage is not reassuring either.

Fixes https://github.com/llvm/llvm-project/issues/63475.

Differential Revision: https://reviews.llvm.org/D154078
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +++----
 llvm/test/CodeGen/X86/pr63475.ll        | 46 ++++++++++++++++---------
 2 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 43b5a20006ba7..92d1bd1bb0252 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3851,13 +3851,10 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
 
   EVT ArgVT = Ins[i].ArgVT;
 
-  // If this is a vector that has been split into multiple parts, and the
-  // scalar size of the parts don't match the vector element size, then we can't
-  // elide the copy. The parts will have padding between them instead of being
-  // packed like a vector.
-  bool ScalarizedAndExtendedVector =
-      ArgVT.isVector() && !VA.getLocVT().isVector() &&
-      VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
+  // If this is a vector that has been split into multiple parts, don't elide
+  // the copy. The layout on the stack may not match the packed in-memory
+  // layout.
+  bool ScalarizedVector = ArgVT.isVector() && !VA.getLocVT().isVector();
 
   // This is an argument in memory. We might be able to perform copy elision.
   // If the argument is passed directly in memory without any extension, then we
@@ -3865,7 +3862,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
   // indirectly by pointer.
   if (Flags.isCopyElisionCandidate() &&
       VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
-      !ScalarizedAndExtendedVector) {
+      !ScalarizedVector) {
     SDValue PartAddr;
     if (Ins[i].PartOffset == 0) {
       // If this is a one-part value or the first part of a multi-part value,
diff --git a/llvm/test/CodeGen/X86/pr63475.ll b/llvm/test/CodeGen/X86/pr63475.ll
index d4b7a7cacefda..0052688b5aa13 100644
--- a/llvm/test/CodeGen/X86/pr63475.ll
+++ b/llvm/test/CodeGen/X86/pr63475.ll
@@ -27,7 +27,8 @@ define void @caller() nounwind {
   ret void
 }
 
-; FIXME: This is a miscompile.
+; Make sure the stack offsets are correct. The distance between them should
+; be 8, not 4.
 define void @callee(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, <7 x i32> %arg) nounwind {
 ; CHECK-LABEL: callee:
 ; CHECK:       # %bb.0: # %start
@@ -37,28 +38,41 @@ define void @callee(ptr %p0, ptr %p1, ptr %p2, ptr %p3, ptr %p4, ptr %p5, <7 x i
 ; CHECK-NEXT:    pushq %r13
 ; CHECK-NEXT:    pushq %r12
 ; CHECK-NEXT:    pushq %rbx
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl 112(%rsp), %ebx
-; CHECK-NEXT:    movl 104(%rsp), %ebp
-; CHECK-NEXT:    movl 96(%rsp), %r14d
-; CHECK-NEXT:    movl 76(%rsp), %r15d
-; CHECK-NEXT:    movl 72(%rsp), %r12d
-; CHECK-NEXT:    movl 64(%rsp), %edi
-; CHECK-NEXT:    movl 68(%rsp), %r13d
-; CHECK-NEXT:    callq use@PLT
-; CHECK-NEXT:    movl %r13d, %edi
-; CHECK-NEXT:    callq use@PLT
-; CHECK-NEXT:    movl %r12d, %edi
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    movl 120(%rsp), %ebx
+; CHECK-NEXT:    movd %ebx, %xmm0
+; CHECK-NEXT:    movl 112(%rsp), %ebp
+; CHECK-NEXT:    movd %ebp, %xmm1
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    movl 104(%rsp), %r15d
+; CHECK-NEXT:    movd %r15d, %xmm0
+; CHECK-NEXT:    movl 96(%rsp), %edi
+; CHECK-NEXT:    movd %edi, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; CHECK-NEXT:    movl 136(%rsp), %r14d
+; CHECK-NEXT:    movd %r14d, %xmm0
+; CHECK-NEXT:    movl 128(%rsp), %r12d
+; CHECK-NEXT:    movd %r12d, %xmm1
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    movl 144(%rsp), %r13d
+; CHECK-NEXT:    movl %r13d, 36(%rsp)
+; CHECK-NEXT:    movq %xmm1, 28(%rsp)
+; CHECK-NEXT:    movdqu %xmm2, 12(%rsp)
 ; CHECK-NEXT:    callq use@PLT
 ; CHECK-NEXT:    movl %r15d, %edi
 ; CHECK-NEXT:    callq use@PLT
-; CHECK-NEXT:    movl %r14d, %edi
-; CHECK-NEXT:    callq use@PLT
 ; CHECK-NEXT:    movl %ebp, %edi
 ; CHECK-NEXT:    callq use@PLT
 ; CHECK-NEXT:    movl %ebx, %edi
 ; CHECK-NEXT:    callq use@PLT
-; CHECK-NEXT:    addq $8, %rsp
+; CHECK-NEXT:    movl %r12d, %edi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    movl %r14d, %edi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    movl %r13d, %edi
+; CHECK-NEXT:    callq use@PLT
+; CHECK-NEXT:    addq $40, %rsp
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    popq %r12
 ; CHECK-NEXT:    popq %r13
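
The layout mismatch the commit message describes can be sketched numerically. The following standalone C++ snippet is an illustration, not code from LLVM; the element count, element size, and slot size are assumptions taken from the commit message (a <7 x i32> argument whose scalarized i32 parts land in 8-byte stack slots on x86_64). It prints where each element would live under a packed vector store versus under the scalarized argument lowering:

#include <cstdio>

// Illustrative only, not part of the patch: compare the packed offsets a
// plain vector store would use against the argument-area offsets produced
// by the scalarized lowering, per the commit message's description of
// 8-byte stack slots on x86_64.
int main() {
  const unsigned NumElts = 7;  // <7 x i32> from the test case
  const unsigned EltSize = 4;  // i32 element size in a packed vector store
  const unsigned SlotSize = 8; // stack slot per scalarized part on x86_64

  for (unsigned I = 0; I != NumElts; ++I) {
    unsigned PackedOff = I * EltSize; // where a plain store puts element I
    unsigned PartOff = I * SlotSize;  // where the argument part actually is
    std::printf("elt %u: packed offset %2u, argument part offset %2u%s\n", I,
                PackedOff, PartOff, PackedOff == PartOff ? "" : " (mismatch)");
  }
  return 0;
}

Every element past the first diverges, which is why reusing the argument area as if it held the packed layout (the elided copy) reads wrong values; the updated CHECK lines above reflect this, loading the parts from 96(%rsp) through 144(%rsp), 8 bytes apart, before repacking them.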