-
Notifications
You must be signed in to change notification settings - Fork 5.9k
8348868: AArch64: Add backend support for SelectFromTwoVector #23570
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -245,6 +245,18 @@ source %{ | |||||
return false; | ||||||
} | ||||||
break; | ||||||
// The "tbl" instruction for two vector table is supported only in Neon and SVE2. Return | ||||||
// false if vector length > 16B but supported SVE version < 2. | ||||||
// For vector length of 16B, generate SVE2 "tbl" instruction if SVE2 is supported, else | ||||||
// generate Neon "tbl" instruction to select from two vectors. | ||||||
// Currently, as we support only vector sizes of 8B and 16B, we disable this operation for | ||||||
// T_LONG and T_DOUBLE on Neon as "mul" does not support 2D arrangement. However, these | ||||||
// types are supported on machines with UseSVE == 2. | ||||||
case Op_SelectFromTwoVector: | ||||||
if (UseSVE < 2 && (type2aelembytes(bt) == 8 || length_in_bytes > 16)) { | ||||||
return false; | ||||||
} | ||||||
break; | ||||||
default: | ||||||
break; | ||||||
} | ||||||
|
@@ -7150,3 +7162,38 @@ instruct vexpandBits(vReg dst, vReg src1, vReg src2) %{ | |||||
%} | ||||||
ins_pipe(pipe_slow); | ||||||
%} | ||||||
|
||||||
// --------------------------------SelectFromTwoVector ----------------------------- | ||||||
|
||||||
instruct vselect_from_two_vectors_SIFNeon(vReg dst, vReg_V17 src1, vReg_V18 src2, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have a similar rule for
Suggested change
|
||||||
vReg index, vReg tmp1, vReg tmp2) %{ | ||||||
predicate((Matcher::vector_element_basic_type(n) == T_SHORT || | ||||||
type2aelembytes(Matcher::vector_element_basic_type(n)) == 4) && | ||||||
(UseSVE < 2 || Matcher::vector_length_in_bytes(n) < 16)); | ||||||
match(Set dst (SelectFromTwoVector (Binary index src1) src2)); | ||||||
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); | ||||||
format %{ "vselect_from_two_vectors_SIF $dst, $src1, $src2, $index\t# vector (4S/8S/2I/4I/2F/4F). KILL $tmp1, $tmp2" %} | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use the same match rule name in the format. Thanks! |
||||||
ins_encode %{ | ||||||
BasicType bt = Matcher::vector_element_basic_type(this); | ||||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this); | ||||||
__ select_from_two_vectors_SIFNeon($dst$$FloatRegister, $src1$$FloatRegister, | ||||||
$src2$$FloatRegister,$index$$FloatRegister, | ||||||
$tmp1$$FloatRegister, $tmp2$$FloatRegister, | ||||||
bt, length_in_bytes); | ||||||
%} | ||||||
ins_pipe(pipe_slow); | ||||||
%} | ||||||
|
||||||
instruct vselect_from_two_vectors(vReg dst, vReg_V17 src1, vReg_V18 src2, vReg index) %{ | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please add comment before the rule why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm still curious. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @theRealAph , apologies for the late response. The tbl instruction needs both the source registers to be consecutive and I could not find a way to make the register allocator choose two consecutive registers for this operation and decided to hard code them. As v0-v7 are used for function arguments, v8-v15 are non-volatile which are not needed for this purpose (as we don't want to be preserving these values across function calls), I chose two of the volatile registers from v16-v31 for the source registers. Please let me know if this is the right way to approach this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suppose it is, yes. Thanks. |
||||||
predicate(Matcher::vector_element_basic_type(n) == T_BYTE || | ||||||
(UseSVE == 2 && Matcher::vector_length_in_bytes(n) >= 16)); | ||||||
match(Set dst (SelectFromTwoVector (Binary index src1) src2)); | ||||||
format %{ "vselect_from_two_vectors $dst, $src1, $src2, $index" %} | ||||||
ins_encode %{ | ||||||
BasicType bt = Matcher::vector_element_basic_type(this); | ||||||
uint length_in_bytes = Matcher::vector_length_in_bytes(this); | ||||||
__ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister, | ||||||
$index$$FloatRegister, bt, length_in_bytes); | ||||||
%} | ||||||
ins_pipe(pipe_slow); | ||||||
%} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -235,6 +235,18 @@ source %{ | |
return false; | ||
} | ||
break; | ||
// The "tbl" instruction for two vector table is supported only in Neon and SVE2. Return | ||
// false if vector length > 16B but supported SVE version < 2. | ||
// For vector length of 16B, generate SVE2 "tbl" instruction if SVE2 is supported, else | ||
// generate Neon "tbl" instruction to select from two vectors. | ||
// Currently, as we support only vector sizes of 8B and 16B, we disable this operation for | ||
// T_LONG and T_DOUBLE on Neon as "mul" does not support 2D arrangement. However, these | ||
// types are supported on machines with UseSVE == 2. | ||
case Op_SelectFromTwoVector: | ||
if (UseSVE < 2 && (type2aelembytes(bt) == 8 || length_in_bytes > 16)) { | ||
return false; | ||
} | ||
break; | ||
default: | ||
break; | ||
} | ||
|
@@ -5132,3 +5144,38 @@ BITPERM(vcompressBits, CompressBitsV, sve_bext) | |
|
||
// ----------------------------------- ExpandBitsV --------------------------------- | ||
BITPERM(vexpandBits, ExpandBitsV, sve_bdep) | ||
|
||
// --------------------------------SelectFromTwoVector ----------------------------- | ||
|
||
instruct vselect_from_two_vectors_SIFNeon(vReg dst, vReg_V17 src1, vReg_V18 src2, | ||
vReg index, vReg tmp1, vReg tmp2) %{ | ||
predicate((Matcher::vector_element_basic_type(n) == T_SHORT || | ||
type2aelembytes(Matcher::vector_element_basic_type(n)) == 4) && | ||
(UseSVE < 2 || Matcher::vector_length_in_bytes(n) < 16)); | ||
match(Set dst (SelectFromTwoVector (Binary index src1) src2)); | ||
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); | ||
format %{ "vselect_from_two_vectors_SIF $dst, $src1, $src2, $index\t# vector (4S/8S/2I/4I/2F/4F). KILL $tmp1, $tmp2" %} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Be careful here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @theRealAph Thanks for the suggestion! makes sense to add USE_KILL for the src1 usage here. I am getting into some errors when I do that. I'll resolve them and get back soon. Thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Maybe that should be USE_DEF or TEMP_DEF. |
||
ins_encode %{ | ||
BasicType bt = Matcher::vector_element_basic_type(this); | ||
uint length_in_bytes = Matcher::vector_length_in_bytes(this); | ||
__ select_from_two_vectors_SIFNeon($dst$$FloatRegister, $src1$$FloatRegister, | ||
$src2$$FloatRegister,$index$$FloatRegister, | ||
$tmp1$$FloatRegister, $tmp2$$FloatRegister, | ||
bt, length_in_bytes); | ||
%} | ||
ins_pipe(pipe_slow); | ||
%} | ||
|
||
// Matches SelectFromTwoVector for T_BYTE vectors, or for any element type when
// UseSVE == 2 and the vector is at least 16 bytes long.
// src1/src2 are pinned to V17/V18: as explained in the review discussion, the "tbl"
// instruction needs both source registers to be consecutive and the register allocator
// could not be made to choose a consecutive pair, so two of the volatile registers
// from v16-v31 are hard coded (v0-v7 are used for function arguments and v8-v15 are
// non-volatile).
instruct vselect_from_two_vectors(vReg dst, vReg_V17 src1, vReg_V18 src2, vReg index) %{ | ||
predicate(Matcher::vector_element_basic_type(n) == T_BYTE || | ||
(UseSVE == 2 && Matcher::vector_length_in_bytes(n) >= 16)); | ||
match(Set dst (SelectFromTwoVector (Binary index src1) src2)); | ||
format %{ "vselect_from_two_vectors $dst, $src1, $src2, $index" %} | ||
ins_encode %{ | ||
// Element type and vector size are passed on to the macro assembler helper.
BasicType bt = Matcher::vector_element_basic_type(this); | ||
uint length_in_bytes = Matcher::vector_length_in_bytes(this); | ||
__ select_from_two_vectors($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister, | ||
$index$$FloatRegister, bt, length_in_bytes); | ||
%} | ||
ins_pipe(pipe_slow); | ||
%} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2853,3 +2853,77 @@ void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) { | |
add(rfp, sp, framesize - 2 * wordSize); | ||
} | ||
} | ||
|
||
void C2_MacroAssembler::select_from_two_vectors_SIFNeon(FloatRegister dst, FloatRegister src1, | ||
FloatRegister src2, FloatRegister index, | ||
FloatRegister tmp1, FloatRegister tmp2, | ||
BasicType bt, unsigned vector_length_in_bytes) { | ||
assert_different_registers(src1, src2, tmp1, tmp2); | ||
assert(bt == T_SHORT || bt == T_INT || bt == T_FLOAT, "unsupported basic type"); | ||
assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported vector length"); | ||
|
||
// Neon "tbl" instruction only supports byte tables, so we need to look at chunks of | ||
// 2B for selecting shorts or chunks of 4B for selecting ints/floats from the table. | ||
// The index values in "index" register are in the range of [0, 2 * NUM_ELEM) where NUM_ELEM | ||
// is the number of elements that can fit in a vector. For ex. for T_SHORT with 64-bit vector length, | ||
// the indices can range from [0, 7]. | ||
// As an example with 64-bit vector length and T_SHORT type - let index = [2, 5, 1, 0] | ||
// Move a constant 0x02 in every byte of tmp1 - tmp1 = [0x0202, 0x0202, 0x0202, 0x0202] | ||
// Move a constant 0x0100 in every 2B of tmp2 - tmp2 = [0x0100, 0x0100, 0x0100, 0x0100] | ||
// Multiply index vector with tmp1 to yield - dst = [0x0404, 0x0a0a, 0x0202, 0x0000] | ||
// Add the multiplied result to the vector in tmp2 to obtain the byte level | ||
// offsets - dst = [0x0504, 0x0b0a, 0x0302, 0x0100] | ||
// Use these offsets in the "tbl" instruction to select chunks of 2B. | ||
|
||
SIMD_Arrangement size1 = vector_length_in_bytes == 16 ? T16B : T8B; | ||
SIMD_Arrangement size2 = vector_length_in_bytes == 16 ? T8H : T4H; | ||
if (bt == T_INT || bt == T_FLOAT) { | ||
size2 = vector_length_in_bytes == 16 ? T4S : T2S; | ||
} | ||
|
||
switch (bt) { | ||
case T_SHORT: | ||
mov(tmp1, size1, 0x02); | ||
mov(tmp2, size2, 0x0100); | ||
break; | ||
case T_INT: | ||
case T_FLOAT: | ||
// Similarly, for int/float the index values for the "tbl" instruction are computed to | ||
// select chunks of 4B for every int/float element | ||
mov(tmp1, size1, 0x04); | ||
mov(tmp2, size2, 0x03020100); | ||
break; | ||
default: | ||
ShouldNotReachHere(); | ||
} | ||
mulv(dst, size2, index, tmp1); | ||
Comment on lines
+2898
to
+2899
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use vector There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @XiaohongGong , thanks I'll give it a try and get back. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Bhavana-Kilambi , left shift can not get right indexes here as values There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @XiaohongGong , thanks but bsl instruction only has 8B/16B types. not D type. I'll see how I can do this with bsl. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, BTW, I'm currently working on adding the vector rearrange support for 2D (i.e. 128-bit long/double vector) types, and I met the same issues. I have tested that using a pattern with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @XiaohongGong thank you! I will check it out. Apologies for being so slow in responding (got pulled into something else). I will update this PR with my latest patch soon. Thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @XiaohongGong , I just got back to working on this PR again!
This is based on the fact that the index vector can only contain values = 0 to 3. If the first bit is 0/1 it refers to the first or second double/long and if the second bit is 0/1 it selects the source (either src1/src2). I am not able to avoid duplicating the source elements. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, I forgot that we have the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @XiaohongGong , thanks for the idea. I did check the codegen and I saw that the iota vectors were being loaded twice for both the source vectors which I felt could be eliminated. So I created a separate implementation for
I have rearranged the instructions and used This implementation is certainly better than my previous implementation by ~23% for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's fine to me. Thanks for your testing! Using the mid-end IR pattern looks better that it may have other mid-end optimization opportunities in some case. |
||
addv(dst, size1, dst, tmp2); // "dst" now contains the processed index elements | ||
// to select a set of bytes (2B/4B) depending on the datatype | ||
|
||
if (vector_length_in_bytes == 8) { | ||
// We need to fit both the source vectors (src1, src2) in a 128-bit register because the | ||
// Neon "tbl" instruction supports only looking up 16B vectors. We then use the Neon "tbl" | ||
// instruction with one vector lookup. | ||
// Note that the insert below writes into src1 itself (src2 is placed in its upper 64 bits). | ||
ins(src1, D, src2, 1, 0); | ||
tbl(dst, size1, src1, 1, dst); | ||
} else { | ||
// If the vector length is 16B, then use the Neon "tbl" instruction with two vector table | ||
assert(vector_length_in_bytes == 16, "must be"); | ||
tbl(dst, size1, src1, 2, dst); | ||
} | ||
} | ||
|
||
// Emits a table lookup that selects elements from the two source vectors "src1" and
// "src2" using the indices in "index" and writes the result to "dst".
//   - T_BYTE with 8B vectors:               Neon "tbl" with a single table register.
//   - T_BYTE with 16B vectors, UseSVE < 2:  Neon "tbl" with a two register table.
//   - everything else:                      SVE2 "tbl" (asserts UseSVE == 2).
// "bt" is the vector element type and "vector_length_in_bytes" the vector size.
void C2_MacroAssembler::select_from_two_vectors(FloatRegister dst, FloatRegister src1, | ||
FloatRegister src2, FloatRegister index, | ||
BasicType bt, unsigned vector_length_in_bytes) { | ||
if (bt == T_BYTE && vector_length_in_bytes == 8) { | ||
// Both 8B sources are packed into one 128-bit register so that a one register
// table lookup can cover them.
// NOTE(review): the insert targets src1, i.e. an input register is modified in
// place - confirm the matching rule declares a suitable effect for src1.
ins(src1, D, src2, 1, 0); | ||
tbl(dst, T8B, src1, 1, index); | ||
} else if (bt == T_BYTE && vector_length_in_bytes == 16 && UseSVE < 2){ | ||
// Two register table starting at src1; src2 is not passed explicitly, so it is
// presumably expected to be the register following src1 - TODO confirm.
tbl(dst, T16B, src1, 2, index); | ||
} else { | ||
assert(UseSVE == 2, "must be sve2"); | ||
// The SVE2 form takes both sources explicitly and operates on elements of type "bt".
SIMD_RegVariant size = elemType_to_regVariant(bt); | ||
sve2_tbl(dst, size, src1, src2, index); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: use upper case