Skip to content

Commit e3ff66b

Browse files
Ensure that Sse3.MoveAndDuplicate correctly tracks supporting SIMD scalar loads (#97783)
1 parent a97ddd2 commit e3ff66b

File tree

1 file changed

+33
-14
lines changed

1 file changed

+33
-14
lines changed

src/coreclr/jit/lowerxarch.cpp

Lines changed: 33 additions & 14 deletions
Original file line number · Diff line number · Diff line change
@@ -7866,16 +7866,16 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
78667866
case NI_AVX512F_BroadcastVector128ToVector512:
78677867
case NI_AVX512F_BroadcastVector256ToVector512:
78687868
{
7869+
assert(!supportsSIMDScalarLoads);
7870+
78697871
if (parentNode->OperIsMemoryLoad())
78707872
{
78717873
supportsGeneralLoads = !childNode->OperIsHWIntrinsic();
78727874
break;
78737875
}
7874-
else
7875-
{
7876-
supportsGeneralLoads = true;
7877-
break;
7878-
}
7876+
7877+
supportsGeneralLoads = true;
7878+
break;
78797879
}
78807880

78817881
case NI_SSE41_ConvertToVector128Int16:
@@ -7941,26 +7941,47 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
79417941
}
79427942

79437943
case NI_SSE2_ConvertToVector128Double:
7944-
case NI_SSE3_MoveAndDuplicate:
79457944
case NI_AVX_ConvertToVector256Double:
7945+
case NI_AVX512F_ConvertToVector512Double:
7946+
case NI_AVX512F_VL_ConvertToVector128Double:
7947+
case NI_AVX512F_VL_ConvertToVector256Double:
79467948
{
79477949
assert(!supportsSIMDScalarLoads);
79487950

79497951
// Most instructions under the non-VEX encoding require aligned operands.
79507952
// Those used for Sse2.ConvertToVector128Double (CVTDQ2PD and CVTPS2PD)
7951-
// and Sse3.MoveAndDuplicate (MOVDDUP) are exceptions and don't fail for
7952-
// unaligned inputs as they read mem64 (half the vector width) instead
7953+
// are exceptions and don't fail for unaligned inputs as they read half
7954+
// the vector width instead
79537955

79547956
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
79557957
supportsUnalignedSIMDLoads = true;
79567958

79577959
const unsigned expectedSize = genTypeSize(parentNode->TypeGet()) / 2;
79587960
const unsigned operandSize = genTypeSize(childNode->TypeGet());
79597961

7960-
// For broadcasts we can only optimize constants and memory operands
7961-
const bool broadcastIsContainable = childNode->OperIsConst() || childNode->isMemoryOp();
7962-
supportsGeneralLoads =
7963-
broadcastIsContainable && supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
7962+
if (childNode->OperIsConst() || childNode->isMemoryOp())
7963+
{
7964+
// For broadcasts we can only optimize constants and memory operands
7965+
// since we're going from a smaller base type to a larger base type
7966+
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
7967+
}
7968+
break;
7969+
}
7970+
7971+
case NI_SSE3_MoveAndDuplicate:
7972+
{
7973+
// Most instructions under the non-VEX encoding require aligned operands.
7974+
// Those used for Sse3.MoveAndDuplicate (MOVDDUP) are exceptions and don't
7975+
// fail for unaligned inputs as they read half the vector width instead
7976+
7977+
supportsAlignedSIMDLoads = !comp->opts.MinOpts();
7978+
supportsUnalignedSIMDLoads = true;
7979+
7980+
const unsigned expectedSize = genTypeSize(parentNode->TypeGet()) / 2;
7981+
const unsigned operandSize = genTypeSize(childNode->TypeGet());
7982+
7983+
supportsGeneralLoads = supportsUnalignedSIMDLoads && (operandSize >= expectedSize);
7984+
supportsSIMDScalarLoads = true;
79647985
break;
79657986
}
79667987

@@ -7986,8 +8007,6 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
79868007
break;
79878008
}
79888009
}
7989-
7990-
assert(supportsSIMDScalarLoads == false);
79918010
break;
79928011
}
79938012

0 commit comments

Comments (0)