-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AArch64] Prefer using DUP instead of INS where possible #138549
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
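@llvm/pr-subscribers-backend-aarch64 Author: Csanád Hajdú (Il-Capitano) Changes: Replace all instances of `INS(IMPLICIT_DEF, 0, v, idx)` with `DUP(v, idx)` in instruction selection. `INS` (e.g. `mov v0.s[0], v1.s[1]`) has a value dependency on its output register, which becomes a false dependency when we're inserting into an `IMPLICIT_DEF` register. We can break this false dependency by using `DUP` (e.g. `mov s0, v1.s[1]`) instead.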
Patch is 31.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/138549.diff 12 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 3962c7eba5833..18d13676bb26d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7349,7 +7349,8 @@ def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))),
// Patterns for i8/i16 -> v2i32/v4i16 lane moves via insert and extract that go via i32.
multiclass Neon_INS_elt_ext_pattern<ValueType VT128, ValueType VT64, ValueType OutVT,
- Instruction INS, SDNodeXForm VecIndexMult> {
+ Instruction INS, Instruction DUP, SubRegIndex DUPSub,
+ SDNodeXForm VecIndexMult> {
// VT64->OutVT
def : Pat<(OutVT (vector_insert (OutVT V64:$src),
(i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))),
@@ -7360,8 +7361,10 @@ multiclass Neon_INS_elt_ext_pattern<ValueType VT128, ValueType VT64, ValueType O
dsub)>;
def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))))),
(EXTRACT_SUBREG
- (INS (IMPLICIT_DEF), 0,
- (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn),
+ (VT128 (SUBREG_TO_REG
+ (i64 0),
+ (DUP (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn),
+ DUPSub)),
dsub)>;
// VT128->OutVT
@@ -7374,25 +7377,38 @@ multiclass Neon_INS_elt_ext_pattern<ValueType VT128, ValueType VT64, ValueType O
dsub)>;
def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))))),
(EXTRACT_SUBREG
- (INS (IMPLICIT_DEF), 0, V128:$Rn, imm:$Immn),
+ (VT128 (SUBREG_TO_REG
+ (i64 0),
+ (DUP V128:$Rn, imm:$Immn),
+ DUPSub)),
dsub)>;
}
-defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v4i16, INSvi8lane, VecIndex_x2>;
-defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v2i32, INSvi8lane, VecIndex_x4>;
-defm : Neon_INS_elt_ext_pattern<v8i16, v4i16, v2i32, INSvi16lane, VecIndex_x2>;
+defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v4i16, INSvi8lane, DUPi8, bsub, VecIndex_x2>;
+defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v2i32, INSvi8lane, DUPi8, bsub, VecIndex_x4>;
+defm : Neon_INS_elt_ext_pattern<v8i16, v4i16, v2i32, INSvi16lane, DUPi16, hsub, VecIndex_x2>;
// bitcast of an extract
-// f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane))
-def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),
- (EXTRACT_SUBREG (INSvi32lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), ssub)>;
+// f32 bitcast(vector_extract(v4i32 src, 0)) -> EXTRACT_SUBREG(src)
+def : Pat<(f32 (bitconvert (i32 (vector_extract v16i8:$src, (i64 0))))),
+ (EXTRACT_SUBREG V128:$src, bsub)>;
+def : Pat<(f32 (bitconvert (i32 (vector_extract v8i16:$src, (i64 0))))),
+ (EXTRACT_SUBREG V128:$src, hsub)>;
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, (i64 0))))),
(EXTRACT_SUBREG V128:$src, ssub)>;
-def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))),
- (EXTRACT_SUBREG (INSvi64lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), dsub)>;
def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, (i64 0))))),
(EXTRACT_SUBREG V128:$src, dsub)>;
+// f32 bitcast(vector_extract(v4i32 src, lane)) -> DUPi32(src, lane)
+def : Pat<(f32 (bitconvert (i32 (vector_extract v16i8:$src, imm:$Immd)))),
+ (EXTRACT_SUBREG (v16i8 (SUBREG_TO_REG (i64 0), (DUPi8 V128:$src, imm:$Immd), bsub)), ssub)>;
+def : Pat<(f32 (bitconvert (i32 (vector_extract v8i16:$src, imm:$Immd)))),
+ (EXTRACT_SUBREG (v8i16 (SUBREG_TO_REG (i64 0), (DUPi16 V128:$src, imm:$Immd), hsub)), ssub)>;
+def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),
+ (DUPi32 V128:$src, imm:$Immd)>;
+def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))),
+ (DUPi64 V128:$src, imm:$Immd)>;
+
// Floating point vector extractions are codegen'd as either a sequence of
// subregister extractions, or a MOV (aka DUP here) if
// the lane number is anything other than zero.
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bd394671881e8..6be6e1a4bdf97 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3453,16 +3453,10 @@ let Predicates = [HasSVE_or_SME] in {
// Alternative case where insertelement is just scalar_to_vector rather than vector_insert.
def : Pat<(v1f64 (scalar_to_vector
(f64 (vector_extract nxv2f64:$vec, VectorIndexD:$index)))),
- (EXTRACT_SUBREG
- (INSvi64lane (IMPLICIT_DEF), (i64 0),
- (EXTRACT_SUBREG nxv2f64:$vec, zsub), VectorIndexD:$index),
- dsub)>;
+ (DUPi64 (EXTRACT_SUBREG nxv2f64:$vec, zsub), VectorIndexD:$index)>;
def : Pat<(v1i64 (scalar_to_vector
(i64 (vector_extract nxv2i64:$vec, VectorIndexD:$index)))),
- (EXTRACT_SUBREG
- (INSvi64lane (IMPLICIT_DEF), (i64 0),
- (EXTRACT_SUBREG nxv2i64:$vec, zsub), VectorIndexD:$index),
- dsub)>;
+ (DUPi64 (EXTRACT_SUBREG nxv2i64:$vec, zsub), VectorIndexD:$index)>;
} // End HasNEON
let Predicates = [HasNEON] in {
diff --git a/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll b/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll
index e2d530ab421ef..07c4dbcf41096 100644
--- a/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll
@@ -193,7 +193,7 @@ define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone {
define i32 @uqxtn_ext(<4 x i32> noundef %a, <4 x i32> noundef %b, i32 %c, float %d, <2 x i64> %e) {
; CHECK-LABEL: uqxtn_ext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov v0.d[0], v3.d[1]
+; CHECK-NEXT: mov d0, v3.d[1]
; CHECK-NEXT: uqxtn s0, d0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -219,7 +219,7 @@ entry:
define <4 x i32> @sqxtun_insext(<4 x i32> noundef %a, <2 x i64> %e) {
; CHECK-LABEL: sqxtun_insext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov v1.d[0], v1.d[1]
+; CHECK-NEXT: mov d1, v1.d[1]
; CHECK-NEXT: sqxtun s1, d1
; CHECK-NEXT: mov v0.s[3], v1.s[0]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
index 33238ccf86a39..3133d0efb4b9b 100644
--- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll
@@ -70,8 +70,8 @@ define <4 x i64> @z_i32_v4i64(i32 %x) {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: movi v1.2d, #0x000000000000ff
-; CHECK-SD-NEXT: mov v2.b[0], v0.b[0]
-; CHECK-SD-NEXT: mov v3.b[0], v0.b[2]
+; CHECK-SD-NEXT: mov b2, v0.b[0]
+; CHECK-SD-NEXT: mov b3, v0.b[2]
; CHECK-SD-NEXT: mov v2.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[3]
; CHECK-SD-NEXT: ushll v0.2d, v2.2s, #0
@@ -172,8 +172,8 @@ define <4 x i64> @s_i32_v4i64(i32 %x) {
; CHECK-SD-LABEL: s_i32_v4i64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w0
-; CHECK-SD-NEXT: mov v1.b[0], v0.b[0]
-; CHECK-SD-NEXT: mov v2.b[0], v0.b[2]
+; CHECK-SD-NEXT: mov b1, v0.b[0]
+; CHECK-SD-NEXT: mov b2, v0.b[2]
; CHECK-SD-NEXT: mov v1.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v2.b[4], v0.b[3]
; CHECK-SD-NEXT: ushll v0.2d, v1.2s, #0
diff --git a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll b/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
index e90b6cb7f809b..65da95e0163f4 100644
--- a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
+++ b/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
@@ -5,7 +5,7 @@
define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECKLE-LABEL: test_reconstructshuffle:
; CHECKLE: // %bb.0:
-; CHECKLE-NEXT: mov v2.b[0], v0.b[3]
+; CHECKLE-NEXT: mov b2, v0.b[3]
; CHECKLE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKLE-NEXT: mov v2.b[2], v0.b[2]
; CHECKLE-NEXT: mov v2.b[4], v0.b[1]
@@ -21,7 +21,7 @@ define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECKBE-NEXT: rev64 v1.16b, v1.16b
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT: mov v2.b[0], v0.b[3]
+; CHECKBE-NEXT: mov b2, v0.b[3]
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: mov v2.b[2], v0.b[2]
; CHECKBE-NEXT: mov v2.b[4], v0.b[1]
diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
index 97c3a4937cda7..05422d3cc6051 100644
--- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
@@ -347,9 +347,8 @@ define half @get_lane_64(<4 x half> %a) #0 {
; CHECK-LABEL: get_lane_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[2]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: mov h0, v0.h[2]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x half> %a to <4 x i16>
@@ -362,9 +361,8 @@ entry:
define half @get_lane_128(<8 x half> %a) #0 {
; CHECK-LABEL: get_lane_128:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: umov w8, v0.h[2]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: mov h0, v0.h[2]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x half> %a to <8 x i16>
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index fb2bdb4d63f47..34858940370e9 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -3443,10 +3443,10 @@ define <8 x double> @stofp_v8i8_v8f64(<8 x i8> %a) {
; CHECK-SD-LABEL: stofp_v8i8_v8f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov v1.b[0], v0.b[0]
-; CHECK-SD-NEXT: mov v2.b[0], v0.b[2]
-; CHECK-SD-NEXT: mov v3.b[0], v0.b[4]
-; CHECK-SD-NEXT: mov v4.b[0], v0.b[6]
+; CHECK-SD-NEXT: mov b1, v0.b[0]
+; CHECK-SD-NEXT: mov b2, v0.b[2]
+; CHECK-SD-NEXT: mov b3, v0.b[4]
+; CHECK-SD-NEXT: mov b4, v0.b[6]
; CHECK-SD-NEXT: mov v1.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v2.b[4], v0.b[3]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[5]
@@ -3492,10 +3492,10 @@ define <8 x double> @utofp_v8i8_v8f64(<8 x i8> %a) {
; CHECK-SD-LABEL: utofp_v8i8_v8f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: mov v2.b[0], v0.b[0]
-; CHECK-SD-NEXT: mov v3.b[0], v0.b[2]
-; CHECK-SD-NEXT: mov v4.b[0], v0.b[4]
-; CHECK-SD-NEXT: mov v5.b[0], v0.b[6]
+; CHECK-SD-NEXT: mov b2, v0.b[0]
+; CHECK-SD-NEXT: mov b3, v0.b[2]
+; CHECK-SD-NEXT: mov b4, v0.b[4]
+; CHECK-SD-NEXT: mov b5, v0.b[6]
; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-NEXT: mov v2.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[3]
@@ -3538,14 +3538,14 @@ define <16 x double> @stofp_v16i8_v16f64(<16 x i8> %a) {
; CHECK-SD-LABEL: stofp_v16i8_v16f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: mov v2.b[0], v0.b[0]
-; CHECK-SD-NEXT: mov v3.b[0], v0.b[2]
-; CHECK-SD-NEXT: mov v4.b[0], v0.b[4]
-; CHECK-SD-NEXT: mov v5.b[0], v0.b[6]
-; CHECK-SD-NEXT: mov v6.b[0], v1.b[0]
-; CHECK-SD-NEXT: mov v7.b[0], v1.b[2]
-; CHECK-SD-NEXT: mov v16.b[0], v1.b[4]
-; CHECK-SD-NEXT: mov v17.b[0], v1.b[6]
+; CHECK-SD-NEXT: mov b2, v0.b[0]
+; CHECK-SD-NEXT: mov b3, v0.b[2]
+; CHECK-SD-NEXT: mov b4, v0.b[4]
+; CHECK-SD-NEXT: mov b5, v0.b[6]
+; CHECK-SD-NEXT: mov b6, v1.b[0]
+; CHECK-SD-NEXT: mov b7, v1.b[2]
+; CHECK-SD-NEXT: mov b16, v1.b[4]
+; CHECK-SD-NEXT: mov b17, v1.b[6]
; CHECK-SD-NEXT: mov v2.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[3]
; CHECK-SD-NEXT: mov v4.b[4], v0.b[5]
@@ -3622,15 +3622,15 @@ define <16 x double> @utofp_v16i8_v16f64(<16 x i8> %a) {
; CHECK-SD-LABEL: utofp_v16i8_v16f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: mov v3.b[0], v0.b[0]
-; CHECK-SD-NEXT: mov v4.b[0], v0.b[2]
-; CHECK-SD-NEXT: mov v5.b[0], v0.b[4]
-; CHECK-SD-NEXT: mov v6.b[0], v0.b[6]
+; CHECK-SD-NEXT: mov b3, v0.b[0]
+; CHECK-SD-NEXT: mov b4, v0.b[2]
+; CHECK-SD-NEXT: mov b5, v0.b[4]
+; CHECK-SD-NEXT: mov b6, v0.b[6]
; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff
-; CHECK-SD-NEXT: mov v7.b[0], v2.b[0]
-; CHECK-SD-NEXT: mov v16.b[0], v2.b[2]
-; CHECK-SD-NEXT: mov v17.b[0], v2.b[4]
-; CHECK-SD-NEXT: mov v18.b[0], v2.b[6]
+; CHECK-SD-NEXT: mov b7, v2.b[0]
+; CHECK-SD-NEXT: mov b16, v2.b[2]
+; CHECK-SD-NEXT: mov b17, v2.b[4]
+; CHECK-SD-NEXT: mov b18, v2.b[6]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v4.b[4], v0.b[3]
; CHECK-SD-NEXT: mov v5.b[4], v0.b[5]
@@ -3699,18 +3699,18 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-LABEL: stofp_v32i8_v32f64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8
-; CHECK-SD-NEXT: mov v5.b[0], v1.b[6]
-; CHECK-SD-NEXT: mov v17.b[0], v1.b[4]
-; CHECK-SD-NEXT: mov v20.b[0], v1.b[2]
-; CHECK-SD-NEXT: mov v21.b[0], v1.b[0]
-; CHECK-SD-NEXT: mov v18.b[0], v0.b[0]
-; CHECK-SD-NEXT: mov v19.b[0], v0.b[6]
-; CHECK-SD-NEXT: mov v22.b[0], v0.b[4]
+; CHECK-SD-NEXT: mov b5, v1.b[6]
+; CHECK-SD-NEXT: mov b17, v1.b[4]
+; CHECK-SD-NEXT: mov b20, v1.b[2]
+; CHECK-SD-NEXT: mov b21, v1.b[0]
+; CHECK-SD-NEXT: mov b18, v0.b[0]
+; CHECK-SD-NEXT: mov b19, v0.b[6]
+; CHECK-SD-NEXT: mov b22, v0.b[4]
; CHECK-SD-NEXT: ext v16.16b, v1.16b, v1.16b, #8
-; CHECK-SD-NEXT: mov v2.b[0], v3.b[0]
-; CHECK-SD-NEXT: mov v4.b[0], v3.b[2]
-; CHECK-SD-NEXT: mov v6.b[0], v3.b[4]
-; CHECK-SD-NEXT: mov v7.b[0], v3.b[6]
+; CHECK-SD-NEXT: mov b2, v3.b[0]
+; CHECK-SD-NEXT: mov b4, v3.b[2]
+; CHECK-SD-NEXT: mov b6, v3.b[4]
+; CHECK-SD-NEXT: mov b7, v3.b[6]
; CHECK-SD-NEXT: mov v5.b[4], v1.b[7]
; CHECK-SD-NEXT: mov v17.b[4], v1.b[5]
; CHECK-SD-NEXT: mov v20.b[4], v1.b[3]
@@ -3718,16 +3718,16 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-NEXT: mov v19.b[4], v0.b[7]
; CHECK-SD-NEXT: mov v22.b[4], v0.b[5]
; CHECK-SD-NEXT: mov v18.b[4], v0.b[1]
-; CHECK-SD-NEXT: mov v23.b[0], v16.b[0]
+; CHECK-SD-NEXT: mov b23, v16.b[0]
; CHECK-SD-NEXT: mov v2.b[4], v3.b[1]
; CHECK-SD-NEXT: mov v4.b[4], v3.b[3]
; CHECK-SD-NEXT: mov v6.b[4], v3.b[5]
; CHECK-SD-NEXT: mov v7.b[4], v3.b[7]
-; CHECK-SD-NEXT: mov v3.b[0], v0.b[2]
+; CHECK-SD-NEXT: mov b3, v0.b[2]
; CHECK-SD-NEXT: shl v5.2s, v5.2s, #24
; CHECK-SD-NEXT: shl v17.2s, v17.2s, #24
; CHECK-SD-NEXT: shl v20.2s, v20.2s, #24
-; CHECK-SD-NEXT: mov v24.b[0], v16.b[4]
+; CHECK-SD-NEXT: mov b24, v16.b[4]
; CHECK-SD-NEXT: mov v23.b[4], v16.b[1]
; CHECK-SD-NEXT: shl v18.2s, v18.2s, #24
; CHECK-SD-NEXT: shl v19.2s, v19.2s, #24
@@ -3739,10 +3739,10 @@ define <32 x double> @stofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-NEXT: shl v0.2s, v21.2s, #24
; CHECK-SD-NEXT: shl v4.2s, v6.2s, #24
; CHECK-SD-NEXT: shl v6.2s, v7.2s, #24
-; CHECK-SD-NEXT: mov v7.b[0], v16.b[2]
+; CHECK-SD-NEXT: mov b7, v16.b[2]
; CHECK-SD-NEXT: sshll v5.2d, v5.2s, #0
; CHECK-SD-NEXT: sshr v20.2s, v20.2s, #24
-; CHECK-SD-NEXT: mov v21.b[0], v16.b[6]
+; CHECK-SD-NEXT: mov b21, v16.b[6]
; CHECK-SD-NEXT: sshll v17.2d, v17.2s, #0
; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-SD-NEXT: shl v22.2s, v22.2s, #24
@@ -3869,25 +3869,25 @@ entry:
define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-LABEL: utofp_v32i8_v32f64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov v6.b[0], v1.b[6]
-; CHECK-SD-NEXT: mov v7.b[0], v1.b[4]
+; CHECK-SD-NEXT: mov b6, v1.b[6]
+; CHECK-SD-NEXT: mov b7, v1.b[4]
; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-SD-NEXT: mov v16.b[0], v1.b[2]
-; CHECK-SD-NEXT: mov v17.b[0], v1.b[0]
-; CHECK-SD-NEXT: mov v19.b[0], v0.b[6]
-; CHECK-SD-NEXT: mov v20.b[0], v0.b[4]
+; CHECK-SD-NEXT: mov b16, v1.b[2]
+; CHECK-SD-NEXT: mov b17, v1.b[0]
+; CHECK-SD-NEXT: mov b19, v0.b[6]
+; CHECK-SD-NEXT: mov b20, v0.b[4]
; CHECK-SD-NEXT: movi d5, #0x0000ff000000ff
-; CHECK-SD-NEXT: mov v24.b[0], v0.b[2]
-; CHECK-SD-NEXT: mov v25.b[0], v0.b[0]
+; CHECK-SD-NEXT: mov b24, v0.b[2]
+; CHECK-SD-NEXT: mov b25, v0.b[0]
; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mov v6.b[4], v1.b[7]
; CHECK-SD-NEXT: mov v7.b[4], v1.b[5]
-; CHECK-SD-NEXT: mov v18.b[0], v3.b[0]
-; CHECK-SD-NEXT: mov v21.b[0], v3.b[2]
-; CHECK-SD-NEXT: mov v23.b[0], v3.b[4]
+; CHECK-SD-NEXT: mov b18, v3.b[0]
+; CHECK-SD-NEXT: mov b21, v3.b[2]
+; CHECK-SD-NEXT: mov b23, v3.b[4]
; CHECK-SD-NEXT: mov v16.b[4], v1.b[3]
; CHECK-SD-NEXT: mov v17.b[4], v1.b[1]
-; CHECK-SD-NEXT: mov v1.b[0], v3.b[6]
+; CHECK-SD-NEXT: mov b1, v3.b[6]
; CHECK-SD-NEXT: mov v19.b[4], v0.b[7]
; CHECK-SD-NEXT: mov v20.b[4], v0.b[5]
; CHECK-SD-NEXT: mov v24.b[4], v0.b[3]
@@ -3905,15 +3905,15 @@ define <32 x double> @utofp_v32i8_v32f64(<32 x i8> %a) {
; CHECK-SD-NEXT: ushll v7.2d, v7.2s, #0
; CHECK-SD-NEXT: and v20.8b, v20.8b, v5.8b
; CHECK-SD-NEXT: ushll v16.2d, v16.2s, #0
-; CHECK-SD-NEXT: mov v4.b[0], v2.b[0]
-; CHECK-SD-NEXT: mov v22.b[0], v2.b[2]
+; CHECK-SD-NEXT: mov b4, v2.b[0]
+; CHECK-SD-NEXT: mov b22, v2.b[2]
; CHECK-SD-NEXT: ushll v17.2d, v17.2s, #0
; CHECK-SD-NEXT: ushll v0.2d, v3.2s, #0
-; CHECK-SD-NEXT: mov v19.b[0], v2.b[4]
+; CHECK-SD-NEXT: mov b19, v2.b[4]
; CHECK-SD-NEXT: ucvtf v6.2d, v6.2d
; CHECK-SD-NEXT: ucvtf v3.2d, v7.2d
; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0
-; CHECK-SD-NEXT: mov v7.b[0], v2.b[6]
+; CHECK-SD-NEXT: mov b7, v2.b[6]
; CHECK-SD-NEXT: ucvtf v16.2d, v16.2d
; CHECK-SD-NEXT: and v24.8b, v24.8b, v5.8b
; CHECK-SD-NEXT: ucvtf v17.2d, v17.2d
diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
index c039da26b7c15..c6aa8701e1721 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
@@ -555,7 +555,7 @@ define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) {
; CHECK-LE-LABEL: bitcast_i16_to_v2i8:
; CHECK-LE: // %bb.0:
; CHECK-LE-NEXT: fmov s1, w0
-; CHECK-LE-NEXT: mov v0.b[0], v1.b[0]
+; CHECK-LE-NEXT: mov b0, v1.b[0]
; CHECK-LE-NEXT: mov v0.b[4], v1.b[1]
; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-LE-NEXT: ret
@@ -564,7 +564,7 @@ define <2 x i8> @bitcast_i16_to_v2i8(i16 %word) {
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: fmov s0, w0
; CHECK-BE-NEXT: rev16 v0.16b, v0.16b
-; CHECK-BE-NEXT: mov v1.b[0], v0.b[0]
+; CHECK-BE-NEXT: mov b1, v0.b[0]
; CHECK-BE-NEXT: mov v1.b[4], v0.b[1]
; CHECK-BE-NEXT: rev64 v0.2s, v1.2s
; CHECK-BE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll b/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll
index 0f4eec4fdfda1..bfdf794c1c27a 100644
--- a/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll
+++ b/llvm/test/CodeGen/AArch64/neon-insert-sve-elt.ll
@@ -360,8 +360,7 @@ define <4 x i32> @test_q_lane4_nxv4i32(<4 x i32> %a, <vscale x 4 x i32> %b) {
define <1 x double> @test_lane0_nxv2f64(<1 x double> %a, <vscale x 2 x double> %b) {
; CHECK-LABEL: test_lane0_nxv2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov v0.d[0], v1.d[0]
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: mov d0, v1.d[0]
; CHECK-NEXT: ret
%c = extractelement <vscale ...
[truncated]
|
Replace all instances of `INS(IMPLICIT_DEF, 0, v, idx)` with `DUP(v, idx)` in instruction selection. `INS` (e.g. `mov v0.s[0], v1.s[1]`) has a value dependency on its output register, which becomes a false dependency when we're inserting into an `IMPLICIT_DEF` register. We can break this false dependency by using `DUP` (e.g. `mov s0, v1.s[1]`) instead.
6e0ba12
to
78b2efb
Compare
Gentle ping. Buildkite failed for some reason with "All tests passed but another part of the build failed." I rebased the patch to trigger a new build. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Replace all instances of `INS(IMPLICIT_DEF, 0, v, idx)` with `DUP(v, idx)` in instruction selection. `INS` (e.g. `mov v0.s[0], v1.s[1]`) has a value dependency on its output register, which becomes a false dependency when we're inserting into an `IMPLICIT_DEF` register. We can break this false dependency by using `DUP` (e.g. `mov s0, v1.s[1]`) instead.
Replace all instances of `INS(IMPLICIT_DEF, 0, v, idx)` with `DUP(v, idx)` in instruction selection. `INS` (e.g. `mov v0.s[0], v1.s[1]`) has a value dependency on its output register, which becomes a false dependency when we're inserting into an `IMPLICIT_DEF` register. We can break this false dependency by using `DUP` (e.g. `mov s0, v1.s[1]`) instead.