[mono] Check for Vector{64,128,256}<T> element type validity before emitting unchecked intrinsic IR for AsByte etc. #53707

Merged
merged 1 commit into from
Jun 5, 2021

Conversation

imhameed
Contributor

@imhameed imhameed commented Jun 4, 2021

No description provided.

… emitting unchecked intrinsic IR for `AsByte` etc.
@@ -560,8 +559,12 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
case SN_AsSingle:
case SN_AsUInt16:
case SN_AsUInt32:
case SN_AsUInt64:
case SN_AsUInt64: {
Member

Are the relevant HWIntrinsic tests covering this still disabled for Mono?

I would've expected one or more of the tests under https://github.com/dotnet/runtime/tree/main/src/tests/JIT/HardwareIntrinsics/General/NotSupported to have caught this. https://github.com/dotnet/runtime/blob/main/src/tests/JIT/HardwareIntrinsics/General/NotSupported/Vector128BooleanAsGeneric_UInt64.cs is the one covering the specific example from the related issue.

Contributor Author

@imhameed imhameed Jun 4, 2021

They've been enabled since #44930. These tests pass because we currently implement operator == for Type via an intrinsic. This, combined with the fact that we currently link Mono with LLVM built with assertions disabled, means that this:

define dso_local monocc i128 @"System.Runtime.Intrinsics.Vector128:WithLower<bool> (System.Runtime.Intrinsics.Vector128`1<bool>,System.Runtime.Intrinsics.Vector64`1<bool>)"([2 x i64] %arg_vector, [1 x i64] %arg_value) #0 gc "coreclr" {
BB0:
  %0 = alloca %"System.Runtime.Intrinsics.Vector64`1<bool>", align 8
  %1 = alloca %"System.Runtime.Intrinsics.Vector128`1<bool>", align 8
  %2 = alloca %"System.Runtime.Intrinsics.Vector128`1<bool>", align 8
  %3 = alloca %"System.Runtime.Intrinsics.Vector128`1<bool>", align 8
  %4 = alloca <2 x i64>, align 16
  %5 = alloca %"System.Runtime.Intrinsics.Vector64`1<bool>", align 8
  %6 = alloca %"System.Runtime.Intrinsics.Vector128`1<bool>", align 8
  %7 = alloca %"System.Runtime.Intrinsics.Vector128`1<bool>", align 8
  %8 = bitcast %"System.Runtime.Intrinsics.Vector128`1<bool>"* %1 to [2 x i64]*
  store [2 x i64] %arg_vector, [2 x i64]* %8
  %9 = bitcast %"System.Runtime.Intrinsics.Vector64`1<bool>"* %0 to [1 x i64]*
  store [1 x i64] %arg_value, [1 x i64]* %9
  br label %BB3

BB3:                                              ; preds = %BB0
  br label %BB2

BB2:                                              ; preds = %BB3
  br i1 false, label %BB4, label %BB5

BB4:                                              ; preds = %BB2
  %10 = load i128 ([2 x i64], [1 x i64])*, i128 ([2 x i64], [1 x i64])** @"[tramp_4428] System.Runtime.Intrinsics.Vector128:<WithLower>g__SoftwareFallback|73_0<bool> (System.Runtime.Intrinsics.Vector128`1<bool>,System.Runtime.Intrinsics.Vector64`1<bool>)"
  %11 = bitcast %"System.Runtime.Intrinsics.Vector128`1<bool>"* %1 to [2 x i64]*
  %12 = load [2 x i64], [2 x i64]* %11
  %13 = bitcast %"System.Runtime.Intrinsics.Vector64`1<bool>"* %0 to [1 x i64]*
  %14 = load [1 x i64], [1 x i64]* %13
  %15 = notail call monocc i128 %10([2 x i64] %12, [1 x i64] %14), !managed_name !0
  %16 = bitcast %"System.Runtime.Intrinsics.Vector128`1<bool>"* %2 to i128*
  store i128 %15, i128* %16
  br label %BB1

BB5:                                              ; preds = %BB2
  %17 = bitcast [2 x i64] %arg_vector to <2 x i64>
  %18 = bitcast [1 x i64] %arg_value to <1 x i64>
  %extract = extractelement <1 x i64> %18, i32 0
  %xinsert = insertelement <2 x i64> %17, i64 %extract, i32 0
  store <2 x i64> %xinsert, <2 x i64>* %4
  br label %BB9

BB1:                                              ; preds = %BB16, %BB4
  %19 = phi %"System.Runtime.Intrinsics.Vector128`1<bool>"* [ %3, %BB16 ], [ %2, %BB4 ]
  %20 = bitcast %"System.Runtime.Intrinsics.Vector128`1<bool>"* %19 to i128*
  %21 = load i128, i128* %20
  ret i128 %21

BB9:                                              ; preds = %BB5
  br label %BB14

BB14:                                             ; preds = %BB9
  br i1 false, label %BB16, label %BB17

BB16:                                             ; preds = %BB25, %BB24, %BB23, %BB22, %BB21, %BB20, %BB19, %BB18, %BB17, %BB14
  %22 = ptrtoint <2 x i64>* %4 to i64
  %23 = add i64 %22, 0
  %24 = inttoptr i64 %23 to i8*
  %25 = bitcast %"System.Runtime.Intrinsics.Vector128`1<bool>"* %3 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %25, i8* %24, i32 16, i1 false)
  br label %BB1

BB17:                                             ; preds = %BB14
  br i1 false, label %BB16, label %BB18

BB18:                                             ; preds = %BB17
  br i1 false, label %BB16, label %BB19

BB19:                                             ; preds = %BB18
  br i1 false, label %BB16, label %BB20

BB20:                                             ; preds = %BB19
  br i1 false, label %BB16, label %BB21

BB21:                                             ; preds = %BB20
  br i1 false, label %BB16, label %BB22

BB22:                                             ; preds = %BB21
  br i1 false, label %BB16, label %BB23

BB23:                                             ; preds = %BB22
  br i1 false, label %BB16, label %BB24

BB24:                                             ; preds = %BB23
  br i1 false, label %BB16, label %BB25

BB25:                                             ; preds = %BB24
  %26 = call i1 @llvm.expect.i1(i1 false, i1 true)
  br i1 %26, label %BB16, label %BB26

BB26:                                             ; preds = %BB25
  %27 = load void (i32)*, void (i32)** @"[tramp_4429] System.ThrowHelper:ThrowNotSupportedException (System.ExceptionResource)"
  notail call monocc void %27(i32 64), !managed_name !1
  unreachable
}

is optimized into this:

define dso_local monocc i128 @"System.Runtime.Intrinsics.Vector128:WithLower<bool> (System.Runtime.Intrinsics.Vector128`1<bool>,System.Runtime.Intrinsics.Vector64`1<bool>)"([2 x i64] %arg_vector, [1 x i64] %arg_value) #0 gc "coreclr" {
BB26:
  %0 = load void (i32)*, void (i32)** @"[tramp_4429] System.ThrowHelper:ThrowNotSupportedException (System.ExceptionResource)", align 8
  %1 = load i64, i64* inttoptr (i64 281473709282408 to i64*)
  %2 = icmp eq i64 %1, 0
  br i1 %2, label %gc.safepoint_poll.exit, label %gc.safepoint_poll.poll.i, !prof !1

gc.safepoint_poll.poll.i:                         ; preds = %BB26
  %3 = load void ()*, void ()** @mono_threads_state_poll
  call void %3() #2
  br label %gc.safepoint_poll.exit

gc.safepoint_poll.exit:                           ; preds = %BB26, %gc.safepoint_poll.poll.i
  notail call monocc void %0(i32 64), !managed_name !2
  unreachable
}

Because we don't run tests with LLVM assertions enabled, we won't fail-fast when emitting an invalid bitcast.

#52791 changes this: with it, we don't inline the managed implementation of operator ==, and the invalid bitcasts aren't elided. So the test later fails when attempting to do isel for a bogus bitcast.

Member

These tests pass because we currently implement operator == for Type via an intrinsic.

Sorry, I'm a bit confused here as to why this means the tests "pass". The tests in question are doing code similar to the following:

            bool succeeded = false;

            try
            {
                Vector128<ulong> result = default(Vector128<bool>).As<bool, ulong>();
            }
            catch (NotSupportedException)
            {
                succeeded = true;
            }

            if (!succeeded)
            {
                TestLibrary.TestFramework.LogInformation($"Vector128BooleanAsGeneric_UInt64: RunNotSupportedScenario failed to throw NotSupportedException.");
                TestLibrary.TestFramework.LogInformation(string.Empty);

                throw new Exception("One or more scenarios did not complete as expected.");
            }

Given this, I'd expect NotSupportedException to be thrown for any invalid T being used with As. However, as seen with #53132 and with what this PR is fixing, unsupported T were being allowed and invalid IR was being generated.

That is, I'm not connecting the dots here on why these tests "pass" but public static Vector128<T> WithLower<T>(this Vector128<T> vector, Vector64<T> value) was getting invalid LLVM IR.

Contributor Author

@imhameed imhameed Jun 4, 2021

Here's the definition of WithLower:

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static Vector128<T> WithLower<T>(this Vector128<T> vector, Vector64<T> value)
            where T : struct
        {
            if (AdvSimd.IsSupported)
            {
                return AdvSimd.InsertScalar(vector.AsUInt64(), 0, value.AsUInt64()).As<ulong, T>();
            }

            return SoftwareFallback(vector, value);

            static Vector128<T> SoftwareFallback(Vector128<T> vector, Vector64<T> value)
            {
                ThrowHelper.ThrowForUnsupportedIntrinsicsVectorBaseType<T>();

                Vector128<T> result = vector;
                Unsafe.As<Vector128<T>, Vector64<T>>(ref result) = value;
                return result;
            }
        }

Today, on arm64, for function parameters, we map Vector128<T> to [2 x i64] and Vector64<T> to [1 x i64], regardless of the validity of the element type.

On function entry, when T is supported, we construct an LLVM vector of the right shape via a sequence of loads and insertelements, and then we associate the resulting LLVM vector value with the mini IR register that represents that function parameter. The rest of our intrinsics-related code assumes that vectors are normal LLVM values.

When T is unsupported, we manipulate it using our generic "value type" convention, which represents instances of value types via an indirection and uses loads and stores to manipulate individual fields. In this case the mini IR registers associated with each function parameter are mapped directly to these array-typed LLVM function parameters (these are promptly stored in alloca'd memory for later "value type" use). My previous PR added a hack to stub in an LLVM undef value for .NET vector types with unsupported element types.

On arm64, AdvSimd.IsSupported is implemented via an intrinsic that binds a constant 1 to a mini IR register.

Vector128:AsUInt64, before this change, was implemented as an intrinsic that emits an unchecked mini IR OP_XCAST, even for unsupported T. This is translated to an LLVM bitcast, under the assumption that the source mini IR register in the originating OP_XCAST is associated with a valid LLVM vector value. This is the source of

  %17 = bitcast [2 x i64] %arg_vector to <2 x i64>
  %18 = bitcast [1 x i64] %arg_value to <1 x i64>

in the unoptimized LLVM IR in my previous comment.

As is later inlined. We (contrary to my incorrect PR description in my now-closed PR) don't intrinsify As. So we then translate two inlined invocations of ThrowForUnsupportedIntrinsicsVectorBaseType. Because we happen to be JIT compiling this System.Private.CoreLib.dll code (rather than AOT compiling it; the only things AOTed right now in the llvmaot test lanes are the individual runtime test assemblies), the exact addresses of all associated Type instances are "JIT-statically" known in each invocation of ThrowForUnsupportedIntrinsicsVectorBaseType. The intrinsified inequality comparisons between JIT-time constants are all true and are "optimized" via mini IR-specific passes that run before translation to LLVM IR. This is the source of

BB17:                                             ; preds = %BB14
  br i1 false, label %BB16, label %BB18

BB18:                                             ; preds = %BB17
  br i1 false, label %BB16, label %BB19

BB19:                                             ; preds = %BB18
  br i1 false, label %BB16, label %BB20

BB20:                                             ; preds = %BB19
  br i1 false, label %BB16, label %BB21

BB21:                                             ; preds = %BB20
  br i1 false, label %BB16, label %BB22

BB22:                                             ; preds = %BB21
  br i1 false, label %BB16, label %BB23

BB23:                                             ; preds = %BB22
  br i1 false, label %BB16, label %BB24

BB24:                                             ; preds = %BB23
  br i1 false, label %BB16, label %BB25

BB25:                                             ; preds = %BB24
  %26 = call i1 @llvm.expect.i1(i1 false, i1 true)
  br i1 %26, label %BB16, label %BB26

BB26:                                             ; preds = %BB25
  %27 = load void (i32)*, void (i32)** @"[tramp_4429] System.ThrowHelper:ThrowNotSupportedException (System.ExceptionResource)"
  notail call monocc void %27(i32 64), !managed_name !1
  unreachable

in the unoptimized LLVM IR in my previous comment.

The passes that we've enabled for LLVM as a JIT then apparently elide all side-effect-free instructions (including the invalid AsUInt64 bitcasts), even if they're located in basic blocks that dominate the basic block that contains the invocation of System.ThrowHelper:ThrowNotSupportedException. So we've been generating invalid IR for a while but because our implementation of Type operator == is an intrinsic (and because we're JIT compiling functions from System.Private.CoreLib.dll, which I should have mentioned in my previous comment), this invalid IR has been getting optimized away. This happened to make these tests "pass".

If we were running these tests with an assertion-enabled LLVM, this would have been caught much earlier.

Member

Today, on arm64, for function parameters, we map Vector128 to [2 x i64] and Vector64 to [1 x i64], regardless of the validity of the element type.

This seems potentially problematic for ABI handling, since vector types get special treatment. In particular, Vector128<T>, for a valid T, is contracted to map to the __m128 ABI type on x86/x64 (there are corresponding ABI types for Vector256 and similar types for Vector64/Vector128 on ARM32/ARM64). This can also be impactful for understanding of things like HFA vs HVA, as struct S { float x, y, z, w; } and __m128 are different types on some platforms, as are 2 x i64 vs 4 x float.

Is there any particular reason this isn't just specially handling valid types as an "n x ..." LLVM vector and handling unsupported types as a regular struct definition, so that things "work" and match the type and ABI handling that RyuJIT has?

Contributor Author

@imhameed imhameed Jun 4, 2021

I have no idea why our calling convention bounces Vector{64,128,256}<T> through GPRs or through memory. This is something I'd like to improve.

EDIT: Probably most accurate to say that the work to make the CC computation code SIMD-aware just hasn't happened yet, so all the existing code piggybacks off the existing support for "value types".

Contributor

The passing/receiving of simd types as simd is just not implemented right now, there is no particular reason for it afaik.

Member

Thanks, I was mostly trying to figure out why it was that things were handled as SIMD at all for unrecognized types.

In RyuJIT, we specially recognize a few T and everything else just falls back to as if it were any plain old struct type.

@imhameed imhameed merged commit b854c93 into dotnet:main Jun 5, 2021
@ghost ghost locked as resolved and limited conversation to collaborators Jul 5, 2021