
Commit fbd9a31

[AArch64][SVE] Combine UXT[BHW] intrinsics to AND. (#137956)
This patch combines uxt[bhw] intrinsics to and_u when the governing predicate is all-true or the passthrough is undef (e.g. in cases of "unknown" merging). This improves code gen as the latter can be emitted as AND immediate instructions. For example, given:

```cpp
svuint64_t foo(svuint64_t x) {
  return svextb_z(svptrue_b64(), x);
}
```

Currently:

```gas
foo:
        ptrue   p0.d
        movi    v1.2d, #0000000000000000
        uxtb    z0.d, p0/m, z0.d
        ret
```

Becomes:

```gas
foo:
        and     z0.d, z0.d, #0xff
        ret
```
1 parent fd161cf commit fbd9a31
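
At the IR level, the combine rewrites the merging UXTB intrinsic into the undef-variant AND intrinsic. Below is a minimal before/after sketch of the all-active case, mirroring the uxtb_m_64 test added in this commit; the function names @before/@after and the values %passthru and %x are illustrative, not taken from the patch:

```llvm
; Before: merging uxtb with an all-active governing predicate.
define <vscale x 2 x i64> @before(<vscale x 2 x i64> %passthru, <vscale x 2 x i64> %x) {
  %r = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> %passthru, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %x)
  ret <vscale x 2 x i64> %r
}

; After instcombine: with all lanes active the passthrough is irrelevant, so the
; zero-extension becomes an AND with the low-8-bit mask (255 for uxtb), which can
; later be selected as an AND immediate.
define <vscale x 2 x i64> @after(<vscale x 2 x i64> %passthru, <vscale x 2 x i64> %x) {
  %r = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %x, <vscale x 2 x i64> splat (i64 255))
  ret <vscale x 2 x i64> %r
}
```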

File tree

2 files changed: +121 -0 lines changed


llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

+26

@@ -2702,6 +2702,26 @@ static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
   return std::nullopt;
 }
 
+static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
+                                                      IntrinsicInst &II,
+                                                      unsigned NumBits) {
+  Value *Passthru = II.getOperand(0);
+  Value *Pg = II.getOperand(1);
+  Value *Op = II.getOperand(2);
+
+  // Convert UXT[BHW] to AND.
+  if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
+    auto *Ty = cast<VectorType>(II.getType());
+    auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
+    auto *Mask = ConstantInt::get(Ty, MaskValue);
+    auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
+                                           {Pg, Op, Mask});
+    return IC.replaceInstUsesWith(II, And);
+  }
+
+  return std::nullopt;
+}
+
 std::optional<Instruction *>
 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                      IntrinsicInst &II) const {
@@ -2801,6 +2821,12 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     return instCombineSVEInsr(IC, II);
   case Intrinsic::aarch64_sve_ptrue:
     return instCombinePTrue(IC, II);
+  case Intrinsic::aarch64_sve_uxtb:
+    return instCombineSVEUxt(IC, II, 8);
+  case Intrinsic::aarch64_sve_uxth:
+    return instCombineSVEUxt(IC, II, 16);
+  case Intrinsic::aarch64_sve_uxtw:
+    return instCombineSVEUxt(IC, II, 32);
   }
 
   return std::nullopt;
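
A similar IR-level sketch for the undef-passthrough path handled by instCombineSVEUxt above; it mirrors the uxtb_x_64 test in the new test file, with illustrative names (%pg, %x). The governing predicate is carried over to and_u unchanged, and since and_u leaves inactive lanes undefined, the result can still be emitted as an AND immediate, per the commit message:

```llvm
; Before: "unknown" merging, arbitrary predicate %pg, undef passthrough.
define <vscale x 2 x i64> @uxtb_x_sketch(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %x) {
  %r = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %x)
  ret <vscale x 2 x i64> %r
}

; After instcombine: and_u keeps %pg, and its inactive lanes are undefined.
define <vscale x 2 x i64> @uxtb_x_sketch_combined(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %x) {
  %r = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %x, <vscale x 2 x i64> splat (i64 255))
  ret <vscale x 2 x i64> %r
}
```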
New test file

+95

@@ -0,0 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=instcombine < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

; Test that we combine uxtb to and_u for all-active predicates.

define <vscale x 2 x i64> @uxtb_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_m_64(
; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> splat (i64 255))
; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
;
  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
  ret <vscale x 2 x i64> %3
}

; Test that we combine uxtb to and_u for undef (``unknown'') passthrough.

define <vscale x 2 x i64> @uxtb_x_64(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_x_64(
; CHECK-SAME: <vscale x 2 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> splat (i64 255))
; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
;
  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %0, <vscale x 2 x i64> %1)
  ret <vscale x 2 x i64> %3
}

; Negative test - ensure we don't combine non-undef, no-all-active predicates.

define <vscale x 2 x i64> @uxtb_m_64_no_ptrue(<vscale x 2 x i1> %0, <vscale x 2 x i64> %1, <vscale x 2 x i64> %2) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @uxtb_m_64_no_ptrue(
; CHECK-SAME: <vscale x 2 x i1> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]], <vscale x 2 x i64> [[TMP2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP4:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> [[TMP2]], <vscale x 2 x i1> [[TMP0]], <vscale x 2 x i64> [[TMP1]])
; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP4]]
;
  %4 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtb.nxv2i64(<vscale x 2 x i64> %2, <vscale x 2 x i1> %0, <vscale x 2 x i64> %1)
  ret <vscale x 2 x i64> %4
}

; For the remaining uxt* intrinsics and types, test that we combine them to the
; appropriate and_u variant with a suitable mask.

define <vscale x 4 x i32> @uxtb_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1) #0 {
; CHECK-LABEL: define <vscale x 4 x i32> @uxtb_m_32(
; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.and.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> splat (i32 255))
; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
;
  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxtb.nxv4i32(<vscale x 4 x i32> %1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
  ret <vscale x 4 x i32> %3
}

define <vscale x 8 x i16> @uxtb_m_16(<vscale x 8 x i16> %0, <vscale x 8 x i16> %1) #0 {
; CHECK-LABEL: define <vscale x 8 x i16> @uxtb_m_16(
; CHECK-SAME: <vscale x 8 x i16> [[TMP0:%.*]], <vscale x 8 x i16> [[TMP1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.aarch64.sve.and.u.nxv8i16(<vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> splat (i16 255))
; CHECK-NEXT:    ret <vscale x 8 x i16> [[TMP3]]
;
  %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.uxtb.nxv8i16(<vscale x 8 x i16> %1, <vscale x 8 x i1> splat (i1 true), <vscale x 8 x i16> %0)
  ret <vscale x 8 x i16> %3
}

define <vscale x 2 x i64> @uxth_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @uxth_m_64(
; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> splat (i64 65535))
; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
;
  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxth.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
  ret <vscale x 2 x i64> %3
}

define <vscale x 4 x i32> @uxth_m_32(<vscale x 4 x i32> %0, <vscale x 4 x i32> %1) #0 {
; CHECK-LABEL: define <vscale x 4 x i32> @uxth_m_32(
; CHECK-SAME: <vscale x 4 x i32> [[TMP0:%.*]], <vscale x 4 x i32> [[TMP1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.and.u.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> splat (i32 65535))
; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP3]]
;
  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.uxth.nxv4i32(<vscale x 4 x i32> %1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> %0)
  ret <vscale x 4 x i32> %3
}

define <vscale x 2 x i64> @uxtw_m_64(<vscale x 2 x i64> %0, <vscale x 2 x i64> %1) #0 {
; CHECK-LABEL: define <vscale x 2 x i64> @uxtw_m_64(
; CHECK-SAME: <vscale x 2 x i64> [[TMP0:%.*]], <vscale x 2 x i64> [[TMP1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.and.u.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> splat (i64 4294967295))
; CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP3]]
;
  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> %1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %0)
  ret <vscale x 2 x i64> %3
}

attributes #0 = { "target-features"="+sve" }
