Skip to content

Commit 2ba8a06

Browse files
Bhavana Kilambi and Jatin Bhateja committed
8348868: AArch64: Add backend support for SelectFromTwoVector
Co-authored-by: Jatin Bhateja <[email protected]> Reviewed-by: haosun, aph, sviswanathan, xgong
1 parent 8ac4a88 commit 2ba8a06

File tree

13 files changed

+973
-26
lines changed

13 files changed

+973
-26
lines changed

src/hotspot/cpu/aarch64/aarch64.ad

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -881,6 +881,46 @@ reg_class vectorx_reg(
881881
V31, V31_H, V31_J, V31_K
882882
);
883883

884+
// Class for vector register V10.
// Contains only V10 (slots V10, V10_H, V10_J, V10_K); operand vReg_V10 allocates from it.
885+
reg_class v10_veca_reg(
886+
V10, V10_H, V10_J, V10_K
887+
);
888+
889+
// Class for vector register V11.
// Contains only V11 (slots V11, V11_H, V11_J, V11_K); operand vReg_V11 allocates from it.
890+
reg_class v11_veca_reg(
891+
V11, V11_H, V11_J, V11_K
892+
);
893+
894+
// Class for vector register V12.
// Contains only V12 (slots V12, V12_H, V12_J, V12_K); operand vReg_V12 allocates from it.
895+
reg_class v12_veca_reg(
896+
V12, V12_H, V12_J, V12_K
897+
);
898+
899+
// Class for vector register V13.
// Contains only V13 (slots V13, V13_H, V13_J, V13_K); operand vReg_V13 allocates from it.
900+
reg_class v13_veca_reg(
901+
V13, V13_H, V13_J, V13_K
902+
);
903+
904+
// Class for vector register V17.
// Contains only V17 (slots V17, V17_H, V17_J, V17_K); operand vReg_V17 allocates from it.
905+
reg_class v17_veca_reg(
906+
V17, V17_H, V17_J, V17_K
907+
);
908+
909+
// Class for vector register V18.
// Contains only V18 (slots V18, V18_H, V18_J, V18_K); operand vReg_V18 allocates from it.
910+
reg_class v18_veca_reg(
911+
V18, V18_H, V18_J, V18_K
912+
);
913+
914+
// Class for vector register V23.
// Contains only V23 (slots V23, V23_H, V23_J, V23_K); operand vReg_V23 allocates from it.
915+
reg_class v23_veca_reg(
916+
V23, V23_H, V23_J, V23_K
917+
);
918+
919+
// Class for vector register V24.
// Contains only V24 (slots V24, V24_H, V24_J, V24_K); operand vReg_V24 allocates from it.
920+
reg_class v24_veca_reg(
921+
V24, V24_H, V24_J, V24_K
922+
);
923+
884924
// Class for 128 bit register v0
885925
reg_class v0_reg(
886926
V0, V0_H
@@ -4969,6 +5009,86 @@ operand vReg()
49695009
interface(REG_INTER);
49705010
%}
49715011

5012+
// Vector operand constrained to register class v10_veca_reg (V10 only); it matches vReg,
// has operand cost 0, an empty format and the REG_INTER interface.
operand vReg_V10()
5013+
%{
5014+
constraint(ALLOC_IN_RC(v10_veca_reg));
5015+
match(vReg);
5016+
5017+
op_cost(0);
5018+
format %{ %}
5019+
interface(REG_INTER);
5020+
%}
5021+
5022+
// Vector operand constrained to register class v11_veca_reg (V11 only); it matches vReg,
// has operand cost 0, an empty format and the REG_INTER interface.
operand vReg_V11()
5023+
%{
5024+
constraint(ALLOC_IN_RC(v11_veca_reg));
5025+
match(vReg);
5026+
5027+
op_cost(0);
5028+
format %{ %}
5029+
interface(REG_INTER);
5030+
%}
5031+
5032+
// Vector operand constrained to register class v12_veca_reg (V12 only); it matches vReg,
// has operand cost 0, an empty format and the REG_INTER interface.
operand vReg_V12()
5033+
%{
5034+
constraint(ALLOC_IN_RC(v12_veca_reg));
5035+
match(vReg);
5036+
5037+
op_cost(0);
5038+
format %{ %}
5039+
interface(REG_INTER);
5040+
%}
5041+
5042+
// Vector operand constrained to register class v13_veca_reg (V13 only); it matches vReg,
// has operand cost 0, an empty format and the REG_INTER interface.
operand vReg_V13()
5043+
%{
5044+
constraint(ALLOC_IN_RC(v13_veca_reg));
5045+
match(vReg);
5046+
5047+
op_cost(0);
5048+
format %{ %}
5049+
interface(REG_INTER);
5050+
%}
5051+
5052+
// Vector operand constrained to register class v17_veca_reg (V17 only); it matches vReg,
// has operand cost 0, an empty format and the REG_INTER interface.
operand vReg_V17()
5053+
%{
5054+
constraint(ALLOC_IN_RC(v17_veca_reg));
5055+
match(vReg);
5056+
5057+
op_cost(0);
5058+
format %{ %}
5059+
interface(REG_INTER);
5060+
%}
5061+
5062+
// Vector operand constrained to register class v18_veca_reg (V18 only); it matches vReg,
// has operand cost 0, an empty format and the REG_INTER interface.
operand vReg_V18()
5063+
%{
5064+
constraint(ALLOC_IN_RC(v18_veca_reg));
5065+
match(vReg);
5066+
5067+
op_cost(0);
5068+
format %{ %}
5069+
interface(REG_INTER);
5070+
%}
5071+
5072+
// Vector operand constrained to register class v23_veca_reg (V23 only); it matches vReg,
// has operand cost 0, an empty format and the REG_INTER interface.
operand vReg_V23()
5073+
%{
5074+
constraint(ALLOC_IN_RC(v23_veca_reg));
5075+
match(vReg);
5076+
5077+
op_cost(0);
5078+
format %{ %}
5079+
interface(REG_INTER);
5080+
%}
5081+
5082+
// Vector operand constrained to register class v24_veca_reg (V24 only); it matches vReg,
// has operand cost 0, an empty format and the REG_INTER interface.
operand vReg_V24()
5083+
%{
5084+
constraint(ALLOC_IN_RC(v24_veca_reg));
5085+
match(vReg);
5086+
5087+
op_cost(0);
5088+
format %{ %}
5089+
interface(REG_INTER);
5090+
%}
5091+
49725092
operand vecA()
49735093
%{
49745094
constraint(ALLOC_IN_RC(vectora_reg));

src/hotspot/cpu/aarch64/aarch64_vector.ad

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,28 @@ source %{
257257
return false;
258258
}
259259
break;
260+
case Op_SelectFromTwoVector:
261+
// The "tbl" instruction for two vector table is supported only in Neon and SVE2. Return
262+
// false if vector length > 16B but supported SVE version < 2.
263+
// For vector length of 16B, generate SVE2 "tbl" instruction if SVE2 is supported, else
264+
// generate Neon "tbl" instruction to select from two vectors.
265+
// This operation is disabled for doubles and longs on machines with SVE < 2 and instead
266+
// the default VectorRearrange + VectorBlend is generated because the performance of the default
267+
// implementation was better than or equal to the implementation for SelectFromTwoVector.
268+
if (UseSVE < 2 && (type2aelembytes(bt) == 8 || length_in_bytes > 16)) {
269+
return false;
270+
}
271+
272+
// Because the SVE2 "tbl" instruction is unpredicated and partial operations cannot be generated
273+
// using masks, we disable this operation on machines where length_in_bytes < MaxVectorSize
274+
// on that machine with the only exception of 8B vector length. This is because at the time of
275+
// writing this, there is no SVE2 machine available with length_in_bytes > 8 and
276+
// length_in_bytes < MaxVectorSize to test this operation on (for example - there isn't an
277+
// SVE2 machine available with MaxVectorSize = 32 to test a case with length_in_bytes = 16).
278+
if (UseSVE == 2 && length_in_bytes > 8 && length_in_bytes < MaxVectorSize) {
279+
return false;
280+
}
281+
break;
260282
default:
261283
break;
262284
}
@@ -7172,3 +7194,71 @@ instruct vexpandBits(vReg dst, vReg src1, vReg src2) %{
71727194
%}
71737195
ins_pipe(pipe_slow);
71747196
%}
7197+
7198+
// ------------------------------------- SelectFromTwoVector ------------------------------------
7199+
// The Neon and SVE2 tbl instruction for two vector lookup requires both the source vectors to be
7200+
// consecutive. The match rules for SelectFromTwoVector reserve two consecutive vector registers
7201+
// for src1 and src2.
7202+
// Four combinations of vector registers for vselect_from_two_vectors are chosen at random
7203+
// (two from volatile and two from non-volatile set) which gives more freedom to the register
7204+
// allocator to choose the best pair of source registers at that point.
7205+
7206+
// SelectFromTwoVector rule whose table sources are typed vReg_V10 (src1) and vReg_V11 (src2).
// dst is declared TEMP_DEF and tmp is a TEMP; index is an unconstrained vReg.
// The encoding reads the node's element type and vector length in bytes and passes them,
// together with all five registers, to select_from_two_vectors().
instruct vselect_from_two_vectors_10_11(vReg dst, vReg_V10 src1, vReg_V11 src2,
7207+
vReg index, vReg tmp) %{
7208+
effect(TEMP_DEF dst, TEMP tmp);
7209+
match(Set dst (SelectFromTwoVector (Binary index src1) src2));
7210+
format %{ "vselect_from_two_vectors_10_11 $dst, $src1, $src2, $index\t# KILL $tmp" %}
7211+
ins_encode %{
7212+
BasicType bt = Matcher::vector_element_basic_type(this);
7213+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
7214+
__ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
7215+
$src2$$FloatRegister, $index$$FloatRegister,
7216+
$tmp$$FloatRegister, bt, length_in_bytes);
7217+
%}
7218+
ins_pipe(pipe_slow);
7219+
%}
7220+
7221+
// SelectFromTwoVector rule whose table sources are typed vReg_V12 (src1) and vReg_V13 (src2).
// dst is declared TEMP_DEF and tmp is a TEMP; index is an unconstrained vReg.
// The encoding reads the node's element type and vector length in bytes and passes them,
// together with all five registers, to select_from_two_vectors().
instruct vselect_from_two_vectors_12_13(vReg dst, vReg_V12 src1, vReg_V13 src2,
7222+
vReg index, vReg tmp) %{
7223+
effect(TEMP_DEF dst, TEMP tmp);
7224+
match(Set dst (SelectFromTwoVector (Binary index src1) src2));
7225+
format %{ "vselect_from_two_vectors_12_13 $dst, $src1, $src2, $index\t# KILL $tmp" %}
7226+
ins_encode %{
7227+
BasicType bt = Matcher::vector_element_basic_type(this);
7228+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
7229+
__ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
7230+
$src2$$FloatRegister, $index$$FloatRegister,
7231+
$tmp$$FloatRegister, bt, length_in_bytes);
7232+
%}
7233+
ins_pipe(pipe_slow);
7234+
%}
7235+
7236+
// SelectFromTwoVector rule whose table sources are typed vReg_V17 (src1) and vReg_V18 (src2).
// dst is declared TEMP_DEF and tmp is a TEMP; index is an unconstrained vReg.
// The encoding reads the node's element type and vector length in bytes and passes them,
// together with all five registers, to select_from_two_vectors().
instruct vselect_from_two_vectors_17_18(vReg dst, vReg_V17 src1, vReg_V18 src2,
7237+
vReg index, vReg tmp) %{
7238+
effect(TEMP_DEF dst, TEMP tmp);
7239+
match(Set dst (SelectFromTwoVector (Binary index src1) src2));
7240+
format %{ "vselect_from_two_vectors_17_18 $dst, $src1, $src2, $index\t# KILL $tmp" %}
7241+
ins_encode %{
7242+
BasicType bt = Matcher::vector_element_basic_type(this);
7243+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
7244+
__ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
7245+
$src2$$FloatRegister, $index$$FloatRegister,
7246+
$tmp$$FloatRegister, bt, length_in_bytes);
7247+
%}
7248+
ins_pipe(pipe_slow);
7249+
%}
7250+
7251+
// SelectFromTwoVector rule whose table sources are typed vReg_V23 (src1) and vReg_V24 (src2).
// dst is declared TEMP_DEF and tmp is a TEMP; index is an unconstrained vReg.
// The encoding reads the node's element type and vector length in bytes and passes them,
// together with all five registers, to select_from_two_vectors().
instruct vselect_from_two_vectors_23_24(vReg dst, vReg_V23 src1, vReg_V24 src2,
7252+
vReg index, vReg tmp) %{
7253+
effect(TEMP_DEF dst, TEMP tmp);
7254+
match(Set dst (SelectFromTwoVector (Binary index src1) src2));
7255+
format %{ "vselect_from_two_vectors_23_24 $dst, $src1, $src2, $index\t# KILL $tmp" %}
7256+
ins_encode %{
7257+
BasicType bt = Matcher::vector_element_basic_type(this);
7258+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
7259+
__ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
7260+
$src2$$FloatRegister, $index$$FloatRegister,
7261+
$tmp$$FloatRegister, bt, length_in_bytes);
7262+
%}
7263+
ins_pipe(pipe_slow);
7264+
%}

src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,28 @@ source %{
247247
return false;
248248
}
249249
break;
250+
case Op_SelectFromTwoVector:
251+
// The "tbl" instruction for two vector table is supported only in Neon and SVE2. Return
252+
// false if vector length > 16B but supported SVE version < 2.
253+
// For vector length of 16B, generate SVE2 "tbl" instruction if SVE2 is supported, else
254+
// generate Neon "tbl" instruction to select from two vectors.
255+
// This operation is disabled for doubles and longs on machines with SVE < 2 and instead
256+
// the default VectorRearrange + VectorBlend is generated because the performance of the default
257+
// implementation was better than or equal to the implementation for SelectFromTwoVector.
258+
if (UseSVE < 2 && (type2aelembytes(bt) == 8 || length_in_bytes > 16)) {
259+
return false;
260+
}
261+
262+
// Because the SVE2 "tbl" instruction is unpredicated and partial operations cannot be generated
263+
// using masks, we disable this operation on machines where length_in_bytes < MaxVectorSize
264+
// on that machine with the only exception of 8B vector length. This is because at the time of
265+
// writing this, there is no SVE2 machine available with length_in_bytes > 8 and
266+
// length_in_bytes < MaxVectorSize to test this operation on (for example - there isn't an
267+
// SVE2 machine available with MaxVectorSize = 32 to test a case with length_in_bytes = 16).
268+
if (UseSVE == 2 && length_in_bytes > 8 && length_in_bytes < MaxVectorSize) {
269+
return false;
270+
}
271+
break;
250272
default:
251273
break;
252274
}
@@ -5154,3 +5176,34 @@ BITPERM(vcompressBits, CompressBitsV, sve_bext)
51545176

51555177
// ----------------------------------- ExpandBitsV ---------------------------------
51565178
BITPERM(vexpandBits, ExpandBitsV, sve_bdep)
5179+
5180+
// ------------------------------------- SelectFromTwoVector ------------------------------------
5181+
// The Neon and SVE2 tbl instruction for two vector lookup requires both the source vectors to be
5182+
// consecutive. The match rules for SelectFromTwoVector reserve two consecutive vector registers
5183+
// for src1 and src2.
5184+
// Four combinations of vector registers for vselect_from_two_vectors are chosen at random
5185+
// (two from volatile and two from non-volatile set) which gives more freedom to the register
5186+
// allocator to choose the best pair of source registers at that point.
5187+
dnl
5188+
dnl SELECT_FROM_TWO_VECTORS($1, $2 )
5189+
dnl SELECT_FROM_TWO_VECTORS(first_reg, second_reg)
dnl Expands to one instruct named vselect_from_two_vectors_<first_reg>_<second_reg> whose
dnl src1 and src2 operands have the types vReg_V<first_reg> and vReg_V<second_reg>.
dnl Both arguments are substituted textually into the instruct name, the operand type
dnl names and the format string.
5190+
define(`SELECT_FROM_TWO_VECTORS', `
5191+
instruct vselect_from_two_vectors_$1_$2(vReg dst, vReg_V$1 src1, vReg_V$2 src2,
5192+
vReg index, vReg tmp) %{
5193+
effect(TEMP_DEF dst, TEMP tmp);
5194+
match(Set dst (SelectFromTwoVector (Binary index src1) src2));
5195+
format %{ "vselect_from_two_vectors_$1_$2 $dst, $src1, $src2, $index\t# KILL $tmp" %}
5196+
ins_encode %{
5197+
BasicType bt = Matcher::vector_element_basic_type(this);
5198+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
5199+
__ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister,
5200+
$src2$$FloatRegister, $index$$FloatRegister,
5201+
$tmp$$FloatRegister, bt, length_in_bytes);
5202+
%}
5203+
ins_pipe(pipe_slow);
5204+
%}')dnl
5205+
dnl
dnl Instantiate the rule for the register pairs (10, 11), (12, 13), (17, 18) and (23, 24).
5206+
SELECT_FROM_TWO_VECTORS(10, 11)
5207+
SELECT_FROM_TWO_VECTORS(12, 13)
5208+
SELECT_FROM_TWO_VECTORS(17, 18)
5209+
SELECT_FROM_TWO_VECTORS(23, 24)

src/hotspot/cpu/aarch64/assembler_aarch64.hpp

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4231,12 +4231,29 @@ template<typename R, typename... Rx>
42314231
sf(imm1, 9, 5), rf(Zd, 0);
42324232
}
42334233

4234-
// SVE programmable table lookup/permute using vector of element indices
4235-
void sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) {
4234+
private:
4235+
// Shared encoder behind the public sve_tbl() overloads.
//   Zd        - register encoded at bit 0.
//   T         - element size variant, encoded in bits 23..22; Q is rejected by the assert.
//   Zn        - register encoded at bit 5 (the two-register overload passes its first table register here).
//   reg_count - number of table registers, 1 or 2 (asserted); selects encoding bits 12..11.
//   Zm        - register encoded at bit 16.
void _sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, unsigned reg_count, FloatRegister Zm) {
42364236
starti;
42374237
assert(T != Q, "invalid size");
4238+
// Only supports one or two vector lookup. One vector lookup was introduced in SVE1
4239+
// and two vector lookup in SVE2
4240+
assert(0 < reg_count && reg_count <= 2, "invalid number of registers");
4241+
4242+
// Bits 12..11 distinguish the forms: 0b10 for a one-register table, 0b01 for a two-register table.
int op11 = (reg_count == 1) ? 0b10 : 0b01;
4243+
42384244
f(0b00000101, 31, 24), f(T, 23, 22), f(0b1, 21), rf(Zm, 16);
4239-
f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0);
4245+
// With op11 == 0b10, bits 15..10 come out as 0b001100, the same value the one-register-only encoding used.
f(0b001, 15, 13), f(op11, 12, 11), f(0b0, 10), rf(Zn, 5), rf(Zd, 0);
4246+
}
4247+
4248+
public:
4249+
// SVE/SVE2 Programmable table lookup in one or two vector table (zeroing)
// One-register form: Zn is the only table register; forwards to _sve_tbl with reg_count = 1.
4250+
void sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) {
4251+
_sve_tbl(Zd, T, Zn, 1, Zm);
4252+
}
4253+
4254+
// Two-register form of the table lookup: the table is the register pair Zn1, Zn2.
// Zn2 must be the successor of Zn1 (asserted); only Zn1 is passed on to the encoder,
// together with reg_count = 2.
void sve_tbl(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn1, FloatRegister Zn2, FloatRegister Zm) {
4255+
assert(Zn1->successor() == Zn2, "invalid order of registers");
4256+
_sve_tbl(Zd, T, Zn1, 2, Zm);
42404257
}
42414258

42424259
// Shuffle active elements of vector to the right and fill with zero

0 commit comments

Comments
 (0)