Skip to content

Commit 7f810c2

Browse files
committed
[VectorCombine] Fix the type used in foldShuffleOfIntrinsics Cost.
The shuffle result needn't have twice the original number of vector elements, so the intermediate type used between the shuffle and the intrinsic should use the number of elements of ShuffleDstTy. I found this when looking at shuffle costs and do not have a test where it alters the output, but I have added some cases where the shuffle output is not twice the size of the input.
1 parent 98e31b7 commit 7f810c2

File tree

2 files changed

+36
-1
lines changed

2 files changed

+36
-1
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

+1-1
Original file line number | Diff line number | Diff line change
@@ -2377,7 +2377,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
23772377
} else {
23782378
auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
23792379
NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
2380-
VecTy->getNumElements() * 2));
2380+
ShuffleDstTy->getNumElements()));
23812381
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
23822382
VecTy, OldMask, CostKind);
23832383
}

llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll

+35
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,20 @@ entry:
6969
ret <8 x i1> %4
7070
}
7171

72+
define <2 x i1> @test4b(<4 x float> %0, <4 x float> %1) {
73+
; CHECK-LABEL: @test4b(
74+
; CHECK-NEXT: entry:
75+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <2 x i32> <i32 0, i32 4>
76+
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> [[TMP2]], i32 0)
77+
; CHECK-NEXT: ret <2 x i1> [[TMP3]]
78+
;
79+
entry:
80+
%2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
81+
%3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
82+
%4 = shufflevector <4 x i1> %2, <4 x i1> %3, <2 x i32> <i32 0, i32 4>
83+
ret <2 x i1> %4
84+
}
85+
7286
define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
7387
; CHECK-LABEL: @test5(
7488
; CHECK-NEXT: entry:
@@ -84,6 +98,27 @@ entry:
8498
ret <8 x float> %6
8599
}
86100

101+
define <2 x float> @test6(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1, <4 x float> %a2, <4 x float> %b2, <4 x float> %c2) {
102+
; SSE-LABEL: @test6(
103+
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], <2 x i32> <i32 0, i32 4>
104+
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B1:%.*]], <4 x float> [[B2:%.*]], <2 x i32> <i32 0, i32 4>
105+
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[C1:%.*]], <4 x float> [[C2:%.*]], <2 x i32> <i32 0, i32 4>
106+
; SSE-NEXT: [[S:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]])
107+
; SSE-NEXT: ret <2 x float> [[S]]
108+
;
109+
; AVX-LABEL: @test6(
110+
; AVX-NEXT: [[F1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A1:%.*]], <4 x float> [[B1:%.*]], <4 x float> [[C1:%.*]])
111+
; AVX-NEXT: [[F2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A2:%.*]], <4 x float> [[B2:%.*]], <4 x float> [[C2:%.*]])
112+
; AVX-NEXT: [[S:%.*]] = shufflevector <4 x float> [[F1]], <4 x float> [[F2]], <2 x i32> <i32 0, i32 4>
113+
; AVX-NEXT: ret <2 x float> [[S]]
114+
;
115+
%f1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1)
116+
%f2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a2, <4 x float> %b2, <4 x float> %c2)
117+
%s = shufflevector <4 x float> %f1, <4 x float> %f2, <2 x i32> <i32 0, i32 4>
118+
ret <2 x float> %s
119+
}
120+
121+
87122
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
88123
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
89124
declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)

0 commit comments

Comments
 (0)