Skip to content

Commit aeb8141

Browse files
Keno authored and KristofferC committed
Carry LLVM patch to fix incorrect codegen
This is rL326967 to fix #28726. (cherry picked from commit e99204b)
1 parent 6bdf13e commit aeb8141

File tree

2 files changed

+302
-0
lines changed

2 files changed

+302
-0
lines changed

deps/llvm.mk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,7 @@ $(eval $(call LLVM_PATCH,llvm-rL323946-LSRTy)) # Remove for 7.0
507507
$(eval $(call LLVM_PATCH,llvm-D50010-VNCoercion-ni))
508508
$(eval $(call LLVM_PATCH,llvm-D50167-scev-umin))
509509
$(eval $(call LLVM_PATCH,llvm-windows-race))
510+
$(eval $(call LLVM_PATCH,llvm-rL326967-aligned-load)) # remove for 7.0
510511
endif # LLVM_VER
511512

512513
# Remove hardcoded OS X requirements in compilter-rt cmake build
Lines changed: 301 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,301 @@
1+
commit b398d8e1fa5a5a914957fa22d0a64db97f6c265e
2+
Author: Craig Topper <[email protected]>
3+
Date: Thu Mar 8 00:21:17 2018 +0000
4+
5+
[X86] Fix some isel patterns that used aligned vector load instructions with unaligned predicates.
6+
7+
These patterns weren't checking the alignment of the load, but were using the aligned instructions. This will cause a GP fault if the data isn't aligned.
8+
9+
I believe these were introduced in r312450.
10+
11+
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@326967 91177308-0d34-0410-b5e6-96231b3b80d8
12+
13+
diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
14+
index db3dfe56531..50c7763a2c3 100644
15+
--- a/lib/Target/X86/X86InstrVecCompiler.td
16+
+++ b/lib/Target/X86/X86InstrVecCompiler.td
17+
@@ -261,10 +261,10 @@ let Predicates = [HasVLX] in {
18+
// will zero the upper bits.
19+
// TODO: Is there a safe way to detect whether the producing instruction
20+
// already zeroed the upper bits?
21+
-multiclass subvector_zero_lowering<string MoveStr, RegisterClass RC,
22+
- ValueType DstTy, ValueType SrcTy,
23+
- ValueType ZeroTy, PatFrag memop,
24+
- SubRegIndex SubIdx> {
25+
+multiclass subvector_zero_lowering<string MoveStr, string LoadStr,
26+
+ RegisterClass RC, ValueType DstTy,
27+
+ ValueType SrcTy, ValueType ZeroTy,
28+
+ PatFrag memop, SubRegIndex SubIdx> {
29+
def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
30+
(SrcTy RC:$src), (iPTR 0))),
31+
(SUBREG_TO_REG (i64 0),
32+
@@ -274,91 +274,91 @@ multiclass subvector_zero_lowering<string MoveStr, RegisterClass RC,
33+
(SrcTy (bitconvert (memop addr:$src))),
34+
(iPTR 0))),
35+
(SUBREG_TO_REG (i64 0),
36+
- (!cast<Instruction>("VMOV"#MoveStr#"rm") addr:$src), SubIdx)>;
37+
+ (!cast<Instruction>("VMOV"#LoadStr#"rm") addr:$src), SubIdx)>;
38+
}
39+
40+
let Predicates = [HasAVX, NoVLX] in {
41+
- defm : subvector_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, loadv2f64,
42+
- sub_xmm>;
43+
- defm : subvector_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, loadv4f32,
44+
- sub_xmm>;
45+
- defm : subvector_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, loadv2i64,
46+
- sub_xmm>;
47+
- defm : subvector_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, loadv2i64,
48+
- sub_xmm>;
49+
- defm : subvector_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, loadv2i64,
50+
- sub_xmm>;
51+
- defm : subvector_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, loadv2i64,
52+
- sub_xmm>;
53+
-}
54+
-
55+
-let Predicates = [HasVLX] in {
56+
- defm : subvector_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32,
57+
+ defm : subvector_zero_lowering<"APD", "UPD", VR128, v4f64, v2f64, v8i32,
58+
loadv2f64, sub_xmm>;
59+
- defm : subvector_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32,
60+
+ defm : subvector_zero_lowering<"APS", "UPS", VR128, v8f32, v4f32, v8i32,
61+
loadv4f32, sub_xmm>;
62+
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32,
63+
+ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v4i64, v2i64, v8i32,
64+
loadv2i64, sub_xmm>;
65+
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32,
66+
+ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v8i32, v4i32, v8i32,
67+
loadv2i64, sub_xmm>;
68+
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32,
69+
+ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v16i16, v8i16, v8i32,
70+
loadv2i64, sub_xmm>;
71+
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32,
72+
- loadv2i64, sub_xmm>;
73+
-
74+
- defm : subvector_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32,
75+
- loadv2f64, sub_xmm>;
76+
- defm : subvector_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32,
77+
- loadv4f32, sub_xmm>;
78+
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32,
79+
- loadv2i64, sub_xmm>;
80+
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32,
81+
- loadv2i64, sub_xmm>;
82+
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32,
83+
- loadv2i64, sub_xmm>;
84+
- defm : subvector_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32,
85+
+ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v32i8, v16i8, v8i32,
86+
loadv2i64, sub_xmm>;
87+
+}
88+
89+
- defm : subvector_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32,
90+
- loadv4f64, sub_ymm>;
91+
- defm : subvector_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32,
92+
- loadv8f32, sub_ymm>;
93+
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32,
94+
- loadv4i64, sub_ymm>;
95+
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32,
96+
- loadv4i64, sub_ymm>;
97+
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32,
98+
- loadv4i64, sub_ymm>;
99+
- defm : subvector_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32,
100+
- loadv4i64, sub_ymm>;
101+
+let Predicates = [HasVLX] in {
102+
+ defm : subvector_zero_lowering<"APDZ128", "UPDZ128", VR128X, v4f64,
103+
+ v2f64, v8i32, loadv2f64, sub_xmm>;
104+
+ defm : subvector_zero_lowering<"APSZ128", "UPSZ128", VR128X, v8f32,
105+
+ v4f32, v8i32, loadv4f32, sub_xmm>;
106+
+ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v4i64,
107+
+ v2i64, v8i32, loadv2i64, sub_xmm>;
108+
+ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v8i32,
109+
+ v4i32, v8i32, loadv2i64, sub_xmm>;
110+
+ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v16i16,
111+
+ v8i16, v8i32, loadv2i64, sub_xmm>;
112+
+ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v32i8,
113+
+ v16i8, v8i32, loadv2i64, sub_xmm>;
114+
+
115+
+ defm : subvector_zero_lowering<"APDZ128", "UPDZ128", VR128X, v8f64,
116+
+ v2f64, v16i32, loadv2f64, sub_xmm>;
117+
+ defm : subvector_zero_lowering<"APSZ128", "UPSZ128", VR128X, v16f32,
118+
+ v4f32, v16i32, loadv4f32, sub_xmm>;
119+
+ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v8i64,
120+
+ v2i64, v16i32, loadv2i64, sub_xmm>;
121+
+ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v16i32,
122+
+ v4i32, v16i32, loadv2i64, sub_xmm>;
123+
+ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v32i16,
124+
+ v8i16, v16i32, loadv2i64, sub_xmm>;
125+
+ defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v64i8,
126+
+ v16i8, v16i32, loadv2i64, sub_xmm>;
127+
+
128+
+ defm : subvector_zero_lowering<"APDZ256", "UPDZ256", VR256X, v8f64,
129+
+ v4f64, v16i32, loadv4f64, sub_ymm>;
130+
+ defm : subvector_zero_lowering<"APSZ256", "UPDZ256", VR256X, v16f32,
131+
+ v8f32, v16i32, loadv8f32, sub_ymm>;
132+
+ defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v8i64,
133+
+ v4i64, v16i32, loadv4i64, sub_ymm>;
134+
+ defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v16i32,
135+
+ v8i32, v16i32, loadv4i64, sub_ymm>;
136+
+ defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v32i16,
137+
+ v16i16, v16i32, loadv4i64, sub_ymm>;
138+
+ defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v64i8,
139+
+ v32i8, v16i32, loadv4i64, sub_ymm>;
140+
}
141+
142+
let Predicates = [HasAVX512, NoVLX] in {
143+
- defm : subvector_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, loadv2f64,
144+
- sub_xmm>;
145+
- defm : subvector_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, loadv4f32,
146+
- sub_xmm>;
147+
- defm : subvector_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, loadv2i64,
148+
- sub_xmm>;
149+
- defm : subvector_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, loadv2i64,
150+
- sub_xmm>;
151+
- defm : subvector_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, loadv2i64,
152+
- sub_xmm>;
153+
- defm : subvector_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, loadv2i64,
154+
- sub_xmm>;
155+
-
156+
- defm : subvector_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32,
157+
- loadv4f64, sub_ymm>;
158+
- defm : subvector_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32,
159+
- loadv8f32, sub_ymm>;
160+
- defm : subvector_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32,
161+
- loadv4i64, sub_ymm>;
162+
- defm : subvector_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32,
163+
- loadv4i64, sub_ymm>;
164+
- defm : subvector_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32,
165+
- loadv4i64, sub_ymm>;
166+
- defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32,
167+
- loadv4i64, sub_ymm>;
168+
+ defm : subvector_zero_lowering<"APD", "UPD", VR128, v8f64, v2f64,
169+
+ v16i32,loadv2f64, sub_xmm>;
170+
+ defm : subvector_zero_lowering<"APS", "UPS", VR128, v16f32, v4f32,
171+
+ v16i32, loadv4f32, sub_xmm>;
172+
+ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v8i64, v2i64,
173+
+ v16i32, loadv2i64, sub_xmm>;
174+
+ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v16i32, v4i32,
175+
+ v16i32, loadv2i64, sub_xmm>;
176+
+ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v32i16, v8i16,
177+
+ v16i32, loadv2i64, sub_xmm>;
178+
+ defm : subvector_zero_lowering<"DQA", "DQU", VR128, v64i8, v16i8,
179+
+ v16i32, loadv2i64, sub_xmm>;
180+
+
181+
+ defm : subvector_zero_lowering<"APDY", "UPDY", VR256, v8f64, v4f64,
182+
+ v16i32, loadv4f64, sub_ymm>;
183+
+ defm : subvector_zero_lowering<"APSY", "UPSY", VR256, v16f32, v8f32,
184+
+ v16i32, loadv8f32, sub_ymm>;
185+
+ defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v8i64, v4i64,
186+
+ v16i32, loadv4i64, sub_ymm>;
187+
+ defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v16i32, v8i32,
188+
+ v16i32, loadv4i64, sub_ymm>;
189+
+ defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v32i16, v16i16,
190+
+ v16i32, loadv4i64, sub_ymm>;
191+
+ defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v64i8, v32i8,
192+
+ v16i32, loadv4i64, sub_ymm>;
193+
}
194+
195+
// List of opcodes that guaranteed to zero the upper elements of vector regs.
196+
diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll
197+
index 6ecd8116443..0f2cf594b1c 100644
198+
--- a/test/CodeGen/X86/merge-consecutive-loads-256.ll
199+
+++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll
200+
@@ -28,13 +28,13 @@ define <4 x double> @merge_4f64_2f64_23(<2 x double>* %ptr) nounwind uwtable noi
201+
define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noinline ssp {
202+
; AVX-LABEL: merge_4f64_2f64_2z:
203+
; AVX: # %bb.0:
204+
-; AVX-NEXT: vmovaps 32(%rdi), %xmm0
205+
+; AVX-NEXT: vmovups 32(%rdi), %xmm0
206+
; AVX-NEXT: retq
207+
;
208+
; X32-AVX-LABEL: merge_4f64_2f64_2z:
209+
; X32-AVX: # %bb.0:
210+
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
211+
-; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0
212+
+; X32-AVX-NEXT: vmovups 32(%eax), %xmm0
213+
; X32-AVX-NEXT: retl
214+
%ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
215+
%val0 = load <2 x double>, <2 x double>* %ptr0
216+
@@ -109,13 +109,13 @@ define <4 x double> @merge_4f64_f64_34uu(double* %ptr) nounwind uwtable noinline
217+
define <4 x double> @merge_4f64_f64_45zz(double* %ptr) nounwind uwtable noinline ssp {
218+
; AVX-LABEL: merge_4f64_f64_45zz:
219+
; AVX: # %bb.0:
220+
-; AVX-NEXT: vmovaps 32(%rdi), %xmm0
221+
+; AVX-NEXT: vmovups 32(%rdi), %xmm0
222+
; AVX-NEXT: retq
223+
;
224+
; X32-AVX-LABEL: merge_4f64_f64_45zz:
225+
; X32-AVX: # %bb.0:
226+
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
227+
-; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0
228+
+; X32-AVX-NEXT: vmovups 32(%eax), %xmm0
229+
; X32-AVX-NEXT: retl
230+
%ptr0 = getelementptr inbounds double, double* %ptr, i64 4
231+
%ptr1 = getelementptr inbounds double, double* %ptr, i64 5
232+
@@ -155,13 +155,13 @@ define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline
233+
define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline ssp {
234+
; AVX-LABEL: merge_4i64_2i64_3z:
235+
; AVX: # %bb.0:
236+
-; AVX-NEXT: vmovaps 48(%rdi), %xmm0
237+
+; AVX-NEXT: vmovups 48(%rdi), %xmm0
238+
; AVX-NEXT: retq
239+
;
240+
; X32-AVX-LABEL: merge_4i64_2i64_3z:
241+
; X32-AVX: # %bb.0:
242+
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
243+
-; X32-AVX-NEXT: vmovaps 48(%eax), %xmm0
244+
+; X32-AVX-NEXT: vmovups 48(%eax), %xmm0
245+
; X32-AVX-NEXT: retl
246+
%ptr0 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 3
247+
%val0 = load <2 x i64>, <2 x i64>* %ptr0
248+
@@ -217,13 +217,13 @@ define <4 x i64> @merge_4i64_i64_1zzu(i64* %ptr) nounwind uwtable noinline ssp {
249+
define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp {
250+
; AVX-LABEL: merge_4i64_i64_23zz:
251+
; AVX: # %bb.0:
252+
-; AVX-NEXT: vmovaps 16(%rdi), %xmm0
253+
+; AVX-NEXT: vmovups 16(%rdi), %xmm0
254+
; AVX-NEXT: retq
255+
;
256+
; X32-AVX-LABEL: merge_4i64_i64_23zz:
257+
; X32-AVX: # %bb.0:
258+
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
259+
-; X32-AVX-NEXT: vmovaps 16(%eax), %xmm0
260+
+; X32-AVX-NEXT: vmovups 16(%eax), %xmm0
261+
; X32-AVX-NEXT: retl
262+
%ptr0 = getelementptr inbounds i64, i64* %ptr, i64 2
263+
%ptr1 = getelementptr inbounds i64, i64* %ptr, i64 3
264+
diff --git a/test/CodeGen/X86/merge-consecutive-loads-512.ll b/test/CodeGen/X86/merge-consecutive-loads-512.ll
265+
index 62102eb382c..3c6eaf65292 100644
266+
--- a/test/CodeGen/X86/merge-consecutive-loads-512.ll
267+
+++ b/test/CodeGen/X86/merge-consecutive-loads-512.ll
268+
@@ -106,13 +106,13 @@ define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noin
269+
define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp {
270+
; ALL-LABEL: merge_8f64_f64_12zzuuzz:
271+
; ALL: # %bb.0:
272+
-; ALL-NEXT: vmovaps 8(%rdi), %xmm0
273+
+; ALL-NEXT: vmovups 8(%rdi), %xmm0
274+
; ALL-NEXT: retq
275+
;
276+
; X32-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
277+
; X32-AVX512F: # %bb.0:
278+
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
279+
-; X32-AVX512F-NEXT: vmovaps 8(%eax), %xmm0
280+
+; X32-AVX512F-NEXT: vmovups 8(%eax), %xmm0
281+
; X32-AVX512F-NEXT: retl
282+
%ptr0 = getelementptr inbounds double, double* %ptr, i64 1
283+
%ptr1 = getelementptr inbounds double, double* %ptr, i64 2
284+
@@ -190,7 +190,7 @@ define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline
285+
define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp {
286+
; ALL-LABEL: merge_8i64_i64_56zz9uzz:
287+
; ALL: # %bb.0:
288+
-; ALL-NEXT: vmovaps 40(%rdi), %xmm0
289+
+; ALL-NEXT: vmovups 40(%rdi), %xmm0
290+
; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
291+
; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
292+
; ALL-NEXT: retq
293+
@@ -198,7 +198,7 @@ define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline s
294+
; X32-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
295+
; X32-AVX512F: # %bb.0:
296+
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
297+
-; X32-AVX512F-NEXT: vmovaps 40(%eax), %xmm0
298+
+; X32-AVX512F-NEXT: vmovups 40(%eax), %xmm0
299+
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
300+
; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
301+
; X32-AVX512F-NEXT: retl

0 commit comments

Comments
 (0)