Skip to content

Commit 7f21678

Browse files
committed
[SLP]Improve minbitwidth analysis.
This improves overall analysis for minbitwidth in SLP. It allows to analyze the trees with store/insertelement root nodes. Also, instead of using single minbitwidth, detected from the very first analysis stage, it tries to detect the best one for each trunc/ext subtree in the graph and use it for the subtree. Results in better code and less vector register pressure. Metric: size..text Program size..text results results0 diff test-suite :: SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant.test 92549.00 92609.00 0.1% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 663381.00 663493.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 663381.00 663493.00 0.0% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 307182.00 307214.00 0.0% test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test 1394420.00 1394484.00 0.0% test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test 1394420.00 1394484.00 0.0% test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2040257.00 2040273.00 0.0% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12396098.00 12395858.00 -0.0% test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test 909944.00 909768.00 -0.0% SingleSource/Benchmarks/Adobe-C++/simple_types_loop_invariant - 4 scalar instructions remain scalar (good). Spec2017/x264 - the whole function idct4x4dc is vectorized using <16 x i16> instead of <16 x i32>, also zext/trunc are removed. In other places last vector zext/sext removed and replaced by extractelement + scalar zext/sext pair. MultiSource/Benchmarks/Bullet/bullet - reduce or <4 x i32> replaced by reduce or <4 x i8> Spec2017/imagick - Removed extra zext from 2 packs of the operations. Spec2017/parest - Removed extra zext, replaced by extractelement+scalar zext Spec2017/blender - the whole bunch of vector zext/sext replaced by extractelement+scalar zext/sext, some extra code vectorized in smaller types. Spec2006/gobmk - fixed cost estimation, some small code remains scalar. Original Pull Request: #84334 The patch has the same functionality (no test changes, no changes in benchmarks) as the original patch, just has some compile time improvements + fixes for xxhash unittest, discovered earlier in the previous version of the patch. Reviewers: Pull Request: #84536
1 parent 21d8085 commit 7f21678

18 files changed

+675
-333
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 547 additions & 210 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
1717
; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
1818
; CHECK-NEXT: [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
1919
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
20-
; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64>
21-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
20+
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
21+
; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
2222
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
2323
; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
24-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
25-
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]]
24+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
25+
; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
26+
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
2627
; CHECK-NEXT: [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
2728
; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
2829
; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64

llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@ define void @h() {
55
; CHECK-LABEL: define void @h() {
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
8-
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 0, i32 0
9-
; CHECK-NEXT: [[TMP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP0]]
10-
; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
11-
; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i32> [[TMP2]] to <8 x i16>
8+
; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 0 to i1
9+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i1> <i1 undef, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, i1 [[TMP0]], i32 0
10+
; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i1> zeroinitializer, [[TMP1]]
11+
; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i1> [[TMP2]], zeroinitializer
12+
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i1> [[TMP4]] to <8 x i16>
1213
; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2
1314
; CHECK-NEXT: ret void
1415
;

llvm/test/Transforms/SLPVectorizer/AArch64/gather-with-minbith-user.ll

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@ define void @h() {
55
; CHECK-LABEL: define void @h() {
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
8-
; CHECK-NEXT: [[TMP0:%.*]] = sub <8 x i32> zeroinitializer, zeroinitializer
9-
; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> zeroinitializer, zeroinitializer
10-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
11-
; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP2]], zeroinitializer
12-
; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i16>
8+
; CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i32> zeroinitializer to <8 x i1>
9+
; CHECK-NEXT: [[TMP1:%.*]] = sub <8 x i1> [[TMP0]], zeroinitializer
10+
; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i1> [[TMP0]], zeroinitializer
11+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
12+
; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i1> [[TMP3]], zeroinitializer
13+
; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i1> [[TMP5]] to <8 x i16>
1314
; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[ARRAYIDX2]], align 2
1415
; CHECK-NEXT: ret void
1516
;

llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
22
;test_i16_extend NOTE: Assertions have been autogenerated by utils/update_test_checks.py
3-
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
3+
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
44
; RUN: cat %t | FileCheck -check-prefix=YAML %s
5-
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
5+
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
66
; RUN: cat %t | FileCheck -check-prefix=YAML %s
77

88

llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,21 +28,11 @@ entry:
2828
define i64 @red_zext_ld_4xi64(ptr %ptr) {
2929
; CHECK-LABEL: @red_zext_ld_4xi64(
3030
; CHECK-NEXT: entry:
31-
; CHECK-NEXT: [[LD0:%.*]] = load i8, ptr [[PTR:%.*]], align 1
32-
; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LD0]] to i64
33-
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 1
34-
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP]], align 1
35-
; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i8 [[LD1]] to i64
36-
; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i64 [[ZEXT]], [[ZEXT_1]]
37-
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 2
38-
; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[GEP_1]], align 1
39-
; CHECK-NEXT: [[ZEXT_2:%.*]] = zext i8 [[LD2]] to i64
40-
; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[ADD_1]], [[ZEXT_2]]
41-
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 3
42-
; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[GEP_2]], align 1
43-
; CHECK-NEXT: [[ZEXT_3:%.*]] = zext i8 [[LD3]] to i64
44-
; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[ADD_2]], [[ZEXT_3]]
45-
; CHECK-NEXT: ret i64 [[ADD_3]]
31+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
32+
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
33+
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
34+
; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
35+
; CHECK-NEXT: ret i64 [[TMP3]]
4636
;
4737
entry:
4838
%ld0 = load i8, ptr %ptr

llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -802,9 +802,10 @@ define i64 @red_zext_ld_4xi64(ptr %ptr) {
802802
; CHECK-LABEL: @red_zext_ld_4xi64(
803803
; CHECK-NEXT: entry:
804804
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PTR:%.*]], align 1
805-
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i64>
806-
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]])
807-
; CHECK-NEXT: ret i64 [[TMP2]]
805+
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
806+
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP1]])
807+
; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i64
808+
; CHECK-NEXT: ret i64 [[TMP3]]
808809
;
809810
entry:
810811
%ld0 = load i8, ptr %ptr

llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,12 @@ define { i64, i64 } @patatino(double %arg) {
1515
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([6 x double], ptr @global, i64 0, i64 4), align 16
1616
; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
1717
; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
18-
; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
19-
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
18+
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
19+
; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
2020
; CHECK-NEXT: [[T16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
21-
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
22-
; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP11]], 1
21+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP8]], i32 1
22+
; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
23+
; CHECK-NEXT: [[T17:%.*]] = insertvalue { i64, i64 } [[T16]], i64 [[TMP12]], 1
2324
; CHECK-NEXT: ret { i64, i64 } [[T17]]
2425
;
2526
bb:

llvm/test/Transforms/SLPVectorizer/X86/int-bitcast-minbitwidth.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-3 < %s | FileCheck %s
2+
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-6 < %s | FileCheck %s
33

44
define void @t(i64 %v) {
55
; CHECK-LABEL: define void @t(

llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-multiuse-with-insertelement.ll

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,17 @@ define void @test(i8 %0) {
66
; CHECK-SAME: i8 [[TMP0:%.*]]) {
77
; CHECK-NEXT: entry:
88
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> <i8 0, i8 poison>, i8 [[TMP0]], i32 1
9-
; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i16>
10-
; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i32>
11-
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i16> [[TMP2]], zeroinitializer
12-
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
13-
; CHECK-NEXT: [[TMP6:%.*]] = zext i16 [[TMP5]] to i32
14-
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
15-
; CHECK-NEXT: [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
16-
; CHECK-NEXT: [[ADD:%.*]] = or i32 [[TMP6]], [[TMP8]]
9+
; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i8> [[TMP1]] to <2 x i32>
10+
; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i8> [[TMP1]], zeroinitializer
11+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
12+
; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
13+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
14+
; CHECK-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32
15+
; CHECK-NEXT: [[ADD:%.*]] = or i32 [[TMP5]], [[TMP7]]
1716
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[ADD]], 1
1817
; CHECK-NEXT: [[CONV9:%.*]] = trunc i32 [[SHR]] to i8
1918
; CHECK-NEXT: store i8 [[CONV9]], ptr null, align 1
20-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
19+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
2120
; CHECK-NEXT: ret void
2221
;
2322
entry:

0 commit comments

Comments
 (0)