Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mono] Enable the supported V128 SIMD intrinsics on Arm64 across all codegen engines #84289

Merged
merged 6 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/mono/mono/arch/arm64/arm64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1111,8 +1111,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
/* NEON :: extract */
#define arm_neon_extr_opcode(p, q, op2, imm4, rd, rn, rm) arm_neon_opcode_3reg ((p), (q), 0b00101110000000000000000000000000 | (op2) << 22 | (imm4) << 11, (rd), (rn), (rm))

#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rd))
#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rd))
#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rm))
#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rm))

/* NEON :: copy */
#define arm_neon_cpy_opcode(p, q, op, imm5, imm4, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110000000000000010000000000 | (op) << 29 | (imm5) << 16 | (imm4) << 11, (rd), (rn))
Expand Down
2 changes: 2 additions & 0 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,8 @@ expand_i4: dest:x src1:i len:4
expand_i8: dest:x src1:i len:4
expand_r4: dest:x src1:f len:4
expand_r8: dest:x src1:f len:4
create_scalar: dest:x src1:i len:12
create_scalar_unsafe: dest:x src1:i len:4

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
147 changes: 93 additions & 54 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -3717,6 +3717,48 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
/* SIMD that is not table-generated */
/* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
* move the following two to the codegen table in simd-arm64.h
*/
case OP_ONES_COMPLEMENT:
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
break;
case OP_NEGATION:
if (is_type_float_macro (ins->inst_c1)) {
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
} else {
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
}
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IMAX:
code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMAX_UN:
code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN:
code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN_UN:
code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XONES:
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_not_16b (code, dreg, dreg);
break;
case OP_XEXTRACT:
code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
break;
case OP_STOREX_MEMBASE:
code = emit_strfpq (code, sreg1, dreg, ins->inst_offset);
break;
Expand All @@ -3730,10 +3772,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
if (cfg->compile_aot && cfg->code_exec_only) {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
arm_ldrx_lit (code, ARMREG_IP0, 0);
arm_ldrfpq (code, ins->dreg, ARMREG_IP0, 0);
arm_ldrfpq (code, dreg, ARMREG_IP0, 0);
} else {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
arm_neon_ldrq_lit (code, ins->dreg, 0);
arm_neon_ldrq_lit (code, dreg, 0);
}
break;
}
Expand All @@ -3744,13 +3786,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_EXPAND_I4:
case OP_EXPAND_I8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_dup_g (code, VREG_FULL, t, ins->dreg, ins->sreg1);
arm_neon_dup_g (code, VREG_FULL, t, dreg, sreg1);
break;
}
case OP_EXPAND_R4:
case OP_EXPAND_R8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, 0);
arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, 0);
break;
}
case OP_EXTRACT_I1:
Expand All @@ -3760,9 +3802,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
const int t = get_type_size_macro (ins->inst_c1);
// smov is not defined for i64
if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) {
arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_umov (code, t, dreg, sreg1, ins->inst_c0);
} else {
arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_smov (code, t, dreg, sreg1, ins->inst_c0);
}
break;
}
Expand All @@ -3773,17 +3815,27 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
// Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should
// set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest
// of the F/XREG is ignored in FREG mode, this operation remains valid.
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, ins->inst_c0);
}
break;
case OP_INSERT_I1:
case OP_INSERT_I2:
case OP_INSERT_I4:
case OP_INSERT_I8:
case OP_INSERT_R4:
case OP_INSERT_R8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_ins_g(code, t, dreg, sreg1, ins->inst_c0);
break;
}
case OP_ARM64_XADDV: {
switch (ins->inst_c0) {
case INTRINS_AARCH64_ADV_SIMD_FADDV:
if (ins->inst_c1 == MONO_TYPE_R8) {
arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F64, dreg, sreg1, sreg1);
} else if (ins->inst_c1 == MONO_TYPE_R4) {
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, sreg1, sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, dreg, dreg);
} else {
g_assert_not_reached ();
}
Expand All @@ -3792,7 +3844,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case INTRINS_AARCH64_ADV_SIMD_UADDV:
case INTRINS_AARCH64_ADV_SIMD_SADDV:
if (get_type_size_macro (ins->inst_c1) == TYPE_I64)
arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_addp (code, VREG_FULL, TYPE_I64, dreg, sreg1, sreg1);
else
g_assert_not_reached (); // remaining int types are handled through the codegen table
break;
Expand All @@ -3802,6 +3854,36 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
case OP_CREATE_SCALAR: {
int t = get_type_size_macro (ins->inst_c1);
switch (ins->inst_c1) {
case MONO_TYPE_R4:
t = SIZE_4;
break;
case MONO_TYPE_R8:
t = SIZE_8;
break;
}
if (is_type_float_macro (ins->inst_c1)) {
// ins expects an integer register
arm_fmov_double_to_rx(code, NEON_TMP_REG, sreg1);
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_ins_g(code, t, dreg, NEON_TMP_REG, 0);
} else {
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_ins_g(code, t, dreg, sreg1, 0);
}
break;
}
case OP_CREATE_SCALAR_UNSAFE: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_ins_g(code, t, dreg, sreg1, 0);
break;
}
// Enable this when adding support for Narrow and enable support for Create at the same time
// case OP_XCONCAT:
// arm_neon_ext_16b(code, dreg, sreg1, sreg2, 8);
// break;

/* BRANCH */
case OP_BR:
Expand Down Expand Up @@ -3875,49 +3957,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
arm_cbnzx (code, sreg1, 0);
break;

/* SIMD that is not table-generated */
/* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
* move the following two to the codegen table in simd-arm64.h
*/
case OP_ONES_COMPLEMENT:
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
break;
case OP_NEGATION:
if (is_type_float_macro (ins->inst_c1)) {
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
} else {
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
}
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IMAX:
code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMAX_UN:
code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN:
code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN_UN:
code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XONES:
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_not_16b (code, dreg, dreg);
break;
case OP_XEXTRACT:
code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
break;

/* ALU */
case OP_IADD:
arm_addw (code, dreg, sreg1, sreg2);
Expand Down
96 changes: 31 additions & 65 deletions src/mono/mono/mini/simd-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -1097,11 +1097,6 @@ static guint16 sri_vector_methods [] = {
SN_AsUInt16,
SN_AsUInt32,
SN_AsUInt64,
SN_AsVector128,
SN_AsVector2,
SN_AsVector256,
SN_AsVector3,
SN_AsVector4,
SN_BitwiseAnd,
SN_BitwiseOr,
SN_Ceiling,
Expand Down Expand Up @@ -1150,8 +1145,6 @@ static guint16 sri_vector_methods [] = {
SN_ToScalar,
SN_ToVector128,
SN_ToVector128Unsafe,
SN_ToVector256,
SN_ToVector256Unsafe,
SN_WidenLower,
SN_WidenUpper,
SN_WithElement,
Expand Down Expand Up @@ -1216,76 +1209,47 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
if (!COMPILE_LLVM (cfg))
return NULL;
#endif
// FIXME: This limitation could be removed once everything here is supported by mini JIT on arm64
#ifdef TARGET_ARM64
if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp))
return NULL;
#endif

int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod);
if (id == -1) {
//check_no_intrinsic_cattr (cmethod);
return NULL;
}

if (!strcmp (m_class_get_name (cfg->method->klass), "Vector256") || !strcmp (m_class_get_name (cfg->method->klass), "Vector512"))
if (!strcmp (m_class_get_name (cmethod->klass), "Vector256") || !strcmp (m_class_get_name (cmethod->klass), "Vector512"))
return NULL;

// FIXME: This limitation could be removed once everything here is supported by mini JIT on arm64
#ifdef TARGET_ARM64
if (!COMPILE_LLVM (cfg)) {
if (!(!strcmp (m_class_get_name (cmethod->klass), "Vector128") || !strcmp (m_class_get_name (cmethod->klass), "Vector")))
return NULL;
switch (id) {
case SN_Add:
case SN_Equals:
case SN_GreaterThan:
case SN_GreaterThanOrEqual:
case SN_LessThan:
case SN_LessThanOrEqual:
case SN_Negate:
case SN_OnesComplement:
case SN_EqualsAny:
case SN_GreaterThanAny:
case SN_GreaterThanOrEqualAny:
case SN_LessThanAny:
case SN_LessThanOrEqualAny:
case SN_EqualsAll:
case SN_GreaterThanAll:
case SN_GreaterThanOrEqualAll:
case SN_LessThanAll:
case SN_LessThanOrEqualAll:
case SN_Subtract:
case SN_BitwiseAnd:
case SN_BitwiseOr:
case SN_Xor:
case SN_As:
case SN_AsByte:
case SN_AsDouble:
case SN_AsInt16:
case SN_AsInt32:
case SN_AsInt64:
case SN_AsSByte:
case SN_AsSingle:
case SN_AsUInt16:
case SN_AsUInt32:
case SN_AsUInt64:
case SN_Max:
case SN_Min:
case SN_Sum:
case SN_ToScalar:
case SN_Floor:
case SN_Ceiling:
case SN_Divide:
case SN_Multiply:
case SN_Sqrt:
case SN_Abs:
break;
default:
case SN_AndNot:
case SN_ConditionalSelect:
case SN_ConvertToDouble:
case SN_ConvertToInt32:
case SN_ConvertToInt64:
case SN_ConvertToSingle:
case SN_ConvertToUInt32:
case SN_ConvertToUInt64:
case SN_Create:
case SN_Dot:
case SN_ExtractMostSignificantBits:
case SN_GetElement:
case SN_GetLower:
case SN_GetUpper:
case SN_Narrow:
case SN_Shuffle:
case SN_ToVector128:
case SN_ToVector128Unsafe:
case SN_WidenLower:
case SN_WidenUpper:
case SN_WithElement:
return NULL;
default:
break;
}
MonoClass *arg0_class = mono_class_from_mono_type_internal (fsig->params [0]);
int class_size = mono_class_value_size (arg0_class, NULL);
if (class_size != 16)
return NULL;
}
#endif

Expand Down Expand Up @@ -1462,9 +1426,11 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
MonoType *etype = get_vector_t_elem_type (fsig->ret);
if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype))
return NULL;
if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype))
return emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1);
else if (is_create_from_half_vectors_overload (fsig))
if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) {
MonoInst* ins = emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1);
ins->inst_c1 = arg0_type;
return ins;
} else if (is_create_from_half_vectors_overload (fsig))
return emit_simd_ins (cfg, klass, OP_XCONCAT, args [0]->dreg, args [1]->dreg);
else if (is_elementwise_create_overload (fsig, etype))
return emit_vector_create_elementwise (cfg, fsig, fsig->ret, etype, args);
Expand Down