Skip to content

8360116: Add support for AVX10 floating point minmax instruction #25914

New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8257,6 +8257,14 @@ void Assembler::vmaxsh(XMMRegister dst, XMMRegister nds, XMMRegister src) {
emit_int16(0x5F, (0xC0 | encode));
}

void Assembler::eminmaxsh(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) {
assert(VM_Version::supports_avx10_2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x53, (0xC0 | encode), imm8);
}

void Assembler::vminsh(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
Expand Down Expand Up @@ -8771,12 +8779,68 @@ void Assembler::vmaxps(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve
emit_int16(0x5F, (0xC0 | encode));
}

void Assembler::evminmaxps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int imm8, int vector_len) {
assert(VM_Version::supports_avx10_2(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x52, (0xC0 | encode), imm8);
}

void Assembler::evminmaxps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int imm8, int vector_len) {
assert(VM_Version::supports_avx10_2(), "");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
if (merge) {
attributes.reset_is_clear_context();
}
vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x52);
emit_operand(dst, src, 0);
emit_int8(imm8);
}

void Assembler::maxpd(XMMRegister dst, XMMRegister src) {
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int16(0x5F, (0xC0 | encode));
}

void Assembler::evminmaxpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int imm8, int vector_len) {
assert(VM_Version::supports_avx10_2(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x52, (0xC0 | encode), imm8);
}

void Assembler::evminmaxpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int imm8, int vector_len) {
assert(VM_Version::supports_avx10_2(), "");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
if (merge) {
attributes.reset_is_clear_context();
}
vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x52);
emit_operand(dst, src, 0);
emit_int8(imm8);
}

void Assembler::vmaxpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(vector_len >= AVX_512bit ? VM_Version::supports_evex() : VM_Version::supports_avx(), "");
InstructionAttr attributes(vector_len, /* vex_w */true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
Expand Down Expand Up @@ -13119,6 +13183,14 @@ void Assembler::vminss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
emit_int16(0x5D, (0xC0 | encode));
}

void Assembler::eminmaxss(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) {
assert(VM_Version::supports_avx10_2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x53, (0xC0 | encode), imm8);
}

void Assembler::vminsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
assert(VM_Version::supports_avx(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
Expand All @@ -13127,6 +13199,14 @@ void Assembler::vminsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
emit_int16(0x5D, (0xC0 | encode));
}

void Assembler::eminmaxsd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8) {
assert(VM_Version::supports_avx10_2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x53, (0xC0 | encode), imm8);
}

void Assembler::vcmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len) {
assert(VM_Version::supports_avx(), "");
assert(vector_len <= AVX_256bit, "");
Expand Down Expand Up @@ -16526,6 +16606,34 @@ void Assembler::evminph(XMMRegister dst, XMMRegister nds, Address src, int vecto
emit_operand(dst, src, 0);
}

void Assembler::evminmaxph(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int imm8, int vector_len) {
assert(VM_Version::supports_avx10_2(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x52, (0xC0 | encode), imm8);
}

void Assembler::evminmaxph(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int imm8, int vector_len) {
assert(VM_Version::supports_avx10_2(), "");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}
attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_NObit);
vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x52);
emit_operand(dst, src, 0);
emit_int8(imm8);
}

void Assembler::evmaxph(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_avx512_fp16(), "requires AVX512-FP16");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "");
Expand Down
22 changes: 22 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,17 @@ class InstructionAttr;
// See fxsave and xsave(EVEX enabled) documentation for layout
const int FPUStateSizeInWords = 2688 / wordSize;


// AVX10 new minmax instruction control mask encoding.
//
// imm8[4] = 0 (please refer to Table 11.1 of section 11.2 of AVX10 manual[1] for details)
// imm8[3:2] (sign control) = 01 (select sign, please refer to Table 11.5 of section 11.2 of AVX10 manual[1] for details)
// imm8[1:0] = 00 (min) / 01 (max)
//
// [1] https://www.intel.com/content/www/us/en/content-details/856721/intel-advanced-vector-extensions-10-2-intel-avx10-2-architecture-specification.html?wapkw=AVX10
const int AVX10_MINMAX_MAX_COMPARE_SIGN = 0x5;
const int AVX10_MINMAX_MIN_COMPARE_SIGN = 0x4;

// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
// is what you get. The Assembler is generating code into a CodeBuffer.
Expand Down Expand Up @@ -2745,6 +2756,17 @@ class Assembler : public AbstractAssembler {
void minpd(XMMRegister dst, XMMRegister src);
void vminpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);

// AVX10.2 floating point minmax instructions
void eminmaxsh(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
void eminmaxss(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
void eminmaxsd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
void evminmaxph(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int imm8, int vector_len);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason evminmaxph does not have a version where src has type Address?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, we do not have a matcher pattern to consume it, as the MIN/MAX sequence was anyway, a bulky one. I have added a new pattern for memory operand flavor of the pattern specifically for AVX-10, along with this patch.

Patch has been regressed over the following tests using Intel SDE https://www.intel.com/content/www/us/en/download/684897/intel-software-development-emulator.html (Version 9.53).

  • test/jdk/jdk/incubator/vector/Double*VectorTests:: (min/max all variants including reduction)
  • test/hotspot/jtreg/compiler/vectorization/TestFloat16VectorOperations.java
  • test/hotspot/jtreg/compiler/c2/irTests/TestFloat16ScalarOperations.java

e.g. command line /home/jatinbha/softwares/sde-external-9.53.0-2025-03-16-lin/sde64 -future -ptr_raise -icount -- java

void evminmaxph(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int imm8, int vector_len);
void evminmaxps(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int imm8, int vector_len);
void evminmaxps(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int imm8, int vector_len);
void evminmaxpd(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int imm8, int vector_len);
void evminmaxpd(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int imm8, int vector_len);

// Maximum of packed integers
void pmaxsb(XMMRegister dst, XMMRegister src);
void vpmaxsb(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
Expand Down
43 changes: 39 additions & 4 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1230,6 +1230,21 @@ void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
}
}

void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 1122 mentions the differences between vminps/vmaxps and Java semantics. Perhaps a mention of the new instructions introduced in this PR might help people who are confused about the fact that vminmax_fp is overloaded.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

XMMRegister src1, XMMRegister src2, int vlen_enc) {
assert(opc == Op_MinV || opc == Op_MinReductionV ||
opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");

int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_MINMAX_MIN_COMPARE_SIGN
: AVX10_MINMAX_MAX_COMPARE_SIGN;
if (elem_bt == T_FLOAT) {
evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
} else {
assert(elem_bt == T_DOUBLE, "");
evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
}
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
Expand Down Expand Up @@ -2537,12 +2552,21 @@ void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_vali
} else { // i = [0,1]
vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
}
vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);

if (VM_Version::supports_avx10_2()) {
vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
} else {
vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
}
wsrc = wdst;
vlen_enc = Assembler::AVX_128bit;
}
if (is_dst_valid) {
vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
if (VM_Version::supports_avx10_2()) {
vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
} else {
vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
}
}
}

Expand All @@ -2568,12 +2592,23 @@ void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_val
assert(i == 0, "%d", i);
vpermilpd(wtmp, wsrc, 1, vlen_enc);
}
vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);

if (VM_Version::supports_avx10_2()) {
vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
} else {
vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
}

wsrc = wdst;
vlen_enc = Assembler::AVX_128bit;
}

if (is_dst_valid) {
vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
if (VM_Version::supports_avx10_2()) {
vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
} else {
vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
}
}
}

Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
int vlen_enc);

void vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
XMMRegister src1, XMMRegister src2, int vlen_enc);

void vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc);

void evminmax_fp(int opcode, BasicType elem_bt,
Expand Down
16 changes: 16 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8841,6 +8841,10 @@ void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XM
evpminsd(dst, mask, nds, src, merge, vector_len); break;
case T_LONG:
evpminsq(dst, mask, nds, src, merge, vector_len); break;
case T_FLOAT:
evminmaxps(dst, mask, nds, src, merge, AVX10_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
case T_DOUBLE:
evminmaxpd(dst, mask, nds, src, merge, AVX10_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
default:
fatal("Unexpected type argument %s", type2name(type)); break;
}
Expand All @@ -8856,6 +8860,10 @@ void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XM
evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
case T_LONG:
evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
case T_FLOAT:
evminmaxps(dst, mask, nds, src, merge, AVX10_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
case T_DOUBLE:
evminmaxpd(dst, mask, nds, src, merge, AVX10_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
default:
fatal("Unexpected type argument %s", type2name(type)); break;
}
Expand All @@ -8871,6 +8879,10 @@ void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XM
evpminsd(dst, mask, nds, src, merge, vector_len); break;
case T_LONG:
evpminsq(dst, mask, nds, src, merge, vector_len); break;
case T_FLOAT:
evminmaxps(dst, mask, nds, src, merge, AVX10_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
case T_DOUBLE:
evminmaxpd(dst, mask, nds, src, merge, AVX10_MINMAX_MIN_COMPARE_SIGN, vector_len); break;
default:
fatal("Unexpected type argument %s", type2name(type)); break;
}
Expand All @@ -8886,6 +8898,10 @@ void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XM
evpmaxsd(dst, mask, nds, src, merge, vector_len); break;
case T_LONG:
evpmaxsq(dst, mask, nds, src, merge, vector_len); break;
case T_FLOAT:
evminmaxps(dst, mask, nds, src, merge, AVX10_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
case T_DOUBLE:
evminmaxps(dst, mask, nds, src, merge, AVX10_MINMAX_MAX_COMPARE_SIGN, vector_len); break;
default:
fatal("Unexpected type argument %s", type2name(type)); break;
}
Expand Down
Loading