commit b398d8e1fa5a5a914957fa22d0a64db97f6c265e
Author: Craig Topper <[email protected]>
Date:   Thu Mar 8 00:21:17 2018 +0000

    [X86] Fix some isel patterns that used aligned vector load instructions with unaligned predicates.

    These patterns weren't checking the alignment of the load, but were using the aligned instructions. This will cause a general-protection (#GP) fault if the data isn't aligned.

    I believe these were introduced in r312450.

    git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@326967 91177308-0d34-0410-b5e6-96231b3b80d8

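An illustrative IR sketch of the failure mode (the function below is hypothetical, modeled on the merge-consecutive-loads tests updated in this patch): a <2 x double> load with only 8-byte alignment is widened into a zeroed <4 x double>. The affected patterns matched this with the 16-byte-aligned VMOVAPD; the fixed patterns select VMOVUPD, which accepts any alignment.

; Sketch only: an under-aligned 128-bit load zero-extended to 256 bits.
; The `align 8` is below VMOVAPD's 16-byte requirement, so selecting the
; aligned instruction can raise #GP at run time on a misaligned pointer.
define <4 x double> @widen_2f64_zero_upper(<2 x double>* %p) {
  %val = load <2 x double>, <2 x double>* %p, align 8
  %res = shufflevector <2 x double> %val, <2 x double> zeroinitializer,
                       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %res
}

Run through llc with -mattr=+avx, this should now select vmovups rather than vmovaps, matching the test updates at the end of this patch.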
diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
index db3dfe56531..50c7763a2c3 100644
--- a/lib/Target/X86/X86InstrVecCompiler.td
+++ b/lib/Target/X86/X86InstrVecCompiler.td
@@ -261,10 +261,10 @@ let Predicates = [HasVLX] in {
 // will zero the upper bits.
 // TODO: Is there a safe way to detect whether the producing instruction
 // already zeroed the upper bits?
-multiclass subvector_zero_lowering<string MoveStr, RegisterClass RC,
-                                   ValueType DstTy, ValueType SrcTy,
-                                   ValueType ZeroTy, PatFrag memop,
-                                   SubRegIndex SubIdx> {
+multiclass subvector_zero_lowering<string MoveStr, string LoadStr,
+                                   RegisterClass RC, ValueType DstTy,
+                                   ValueType SrcTy, ValueType ZeroTy,
+                                   PatFrag memop, SubRegIndex SubIdx> {
   def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
                                      (SrcTy RC:$src), (iPTR 0))),
             (SUBREG_TO_REG (i64 0),
@@ -274,91 +274,91 @@ multiclass subvector_zero_lowering<string MoveStr, RegisterClass RC,
                                      (SrcTy (bitconvert (memop addr:$src))),
                                      (iPTR 0))),
             (SUBREG_TO_REG (i64 0),
-                           (!cast<Instruction>("VMOV"#MoveStr#"rm") addr:$src), SubIdx)>;
+                           (!cast<Instruction>("VMOV"#LoadStr#"rm") addr:$src), SubIdx)>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
-  defm : subvector_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, loadv2f64,
-                                 sub_xmm>;
-  defm : subvector_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, loadv4f32,
-                                 sub_xmm>;
-  defm : subvector_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, loadv2i64,
-                                 sub_xmm>;
-  defm : subvector_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, loadv2i64,
-                                 sub_xmm>;
-  defm : subvector_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, loadv2i64,
-                                 sub_xmm>;
-  defm : subvector_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, loadv2i64,
-                                 sub_xmm>;
-}
-
-let Predicates = [HasVLX] in {
-  defm : subvector_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32,
+  defm : subvector_zero_lowering<"APD", "UPD", VR128, v4f64, v2f64, v8i32,
                                  loadv2f64, sub_xmm>;
-  defm : subvector_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32,
+  defm : subvector_zero_lowering<"APS", "UPS", VR128, v8f32, v4f32, v8i32,
                                  loadv4f32, sub_xmm>;
-  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32,
+  defm : subvector_zero_lowering<"DQA", "DQU", VR128, v4i64, v2i64, v8i32,
                                  loadv2i64, sub_xmm>;
-  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32,
+  defm : subvector_zero_lowering<"DQA", "DQU", VR128, v8i32, v4i32, v8i32,
                                  loadv2i64, sub_xmm>;
-  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32,
+  defm : subvector_zero_lowering<"DQA", "DQU", VR128, v16i16, v8i16, v8i32,
                                  loadv2i64, sub_xmm>;
-  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32,
-                                 loadv2i64, sub_xmm>;
-
-  defm : subvector_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32,
-                                 loadv2f64, sub_xmm>;
-  defm : subvector_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32,
-                                 loadv4f32, sub_xmm>;
-  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32,
-                                 loadv2i64, sub_xmm>;
-  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32,
-                                 loadv2i64, sub_xmm>;
-  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32,
-                                 loadv2i64, sub_xmm>;
-  defm : subvector_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32,
+  defm : subvector_zero_lowering<"DQA", "DQU", VR128, v32i8, v16i8, v8i32,
                                  loadv2i64, sub_xmm>;
+}
 
-  defm : subvector_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32,
-                                 loadv4f64, sub_ymm>;
-  defm : subvector_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32,
-                                 loadv8f32, sub_ymm>;
-  defm : subvector_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32,
-                                 loadv4i64, sub_ymm>;
-  defm : subvector_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32,
-                                 loadv4i64, sub_ymm>;
-  defm : subvector_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32,
-                                 loadv4i64, sub_ymm>;
-  defm : subvector_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32,
-                                 loadv4i64, sub_ymm>;
+let Predicates = [HasVLX] in {
+  defm : subvector_zero_lowering<"APDZ128", "UPDZ128", VR128X, v4f64,
+                                 v2f64, v8i32, loadv2f64, sub_xmm>;
+  defm : subvector_zero_lowering<"APSZ128", "UPSZ128", VR128X, v8f32,
+                                 v4f32, v8i32, loadv4f32, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v4i64,
+                                 v2i64, v8i32, loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v8i32,
+                                 v4i32, v8i32, loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v16i16,
+                                 v8i16, v8i32, loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v32i8,
+                                 v16i8, v8i32, loadv2i64, sub_xmm>;
+
+  defm : subvector_zero_lowering<"APDZ128", "UPDZ128", VR128X, v8f64,
+                                 v2f64, v16i32, loadv2f64, sub_xmm>;
+  defm : subvector_zero_lowering<"APSZ128", "UPSZ128", VR128X, v16f32,
+                                 v4f32, v16i32, loadv4f32, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v8i64,
+                                 v2i64, v16i32, loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v16i32,
+                                 v4i32, v16i32, loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v32i16,
+                                 v8i16, v16i32, loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA64Z128", "DQU64Z128", VR128X, v64i8,
+                                 v16i8, v16i32, loadv2i64, sub_xmm>;
+
+  defm : subvector_zero_lowering<"APDZ256", "UPDZ256", VR256X, v8f64,
+                                 v4f64, v16i32, loadv4f64, sub_ymm>;
+  defm : subvector_zero_lowering<"APSZ256", "UPSZ256", VR256X, v16f32,
+                                 v8f32, v16i32, loadv8f32, sub_ymm>;
+  defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v8i64,
+                                 v4i64, v16i32, loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v16i32,
+                                 v8i32, v16i32, loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v32i16,
+                                 v16i16, v16i32, loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQA64Z256", "DQU64Z256", VR256X, v64i8,
+                                 v32i8, v16i32, loadv4i64, sub_ymm>;
 }
 
 let Predicates = [HasAVX512, NoVLX] in {
-  defm : subvector_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, loadv2f64,
-                                 sub_xmm>;
-  defm : subvector_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, loadv4f32,
-                                 sub_xmm>;
-  defm : subvector_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, loadv2i64,
-                                 sub_xmm>;
-  defm : subvector_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, loadv2i64,
-                                 sub_xmm>;
-  defm : subvector_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, loadv2i64,
-                                 sub_xmm>;
-  defm : subvector_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, loadv2i64,
-                                 sub_xmm>;
-
-  defm : subvector_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32,
-                                 loadv4f64, sub_ymm>;
-  defm : subvector_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32,
-                                 loadv8f32, sub_ymm>;
-  defm : subvector_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32,
-                                 loadv4i64, sub_ymm>;
-  defm : subvector_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32,
-                                 loadv4i64, sub_ymm>;
-  defm : subvector_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32,
-                                 loadv4i64, sub_ymm>;
-  defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32,
-                                 loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"APD", "UPD", VR128, v8f64, v2f64,
+                                 v16i32, loadv2f64, sub_xmm>;
+  defm : subvector_zero_lowering<"APS", "UPS", VR128, v16f32, v4f32,
+                                 v16i32, loadv4f32, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", "DQU", VR128, v8i64, v2i64,
+                                 v16i32, loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", "DQU", VR128, v16i32, v4i32,
+                                 v16i32, loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", "DQU", VR128, v32i16, v8i16,
+                                 v16i32, loadv2i64, sub_xmm>;
+  defm : subvector_zero_lowering<"DQA", "DQU", VR128, v64i8, v16i8,
+                                 v16i32, loadv2i64, sub_xmm>;
+
+  defm : subvector_zero_lowering<"APDY", "UPDY", VR256, v8f64, v4f64,
+                                 v16i32, loadv4f64, sub_ymm>;
+  defm : subvector_zero_lowering<"APSY", "UPSY", VR256, v16f32, v8f32,
+                                 v16i32, loadv8f32, sub_ymm>;
+  defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v8i64, v4i64,
+                                 v16i32, loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v16i32, v8i32,
+                                 v16i32, loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v32i16, v16i16,
+                                 v16i32, loadv4i64, sub_ymm>;
+  defm : subvector_zero_lowering<"DQAY", "DQUY", VR256, v64i8, v32i8,
+                                 v16i32, loadv4i64, sub_ymm>;
 }
 
 // List of opcodes that guaranteed to zero the upper elements of vector regs.
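To spell out the mechanism (an illustrative expansion, not part of the patch): the register-to-register pattern still concatenates the aligned mnemonic, while the memory pattern now uses the new LoadStr parameter. For the HasAVX v2f64 instantiation above, the TableGen string concatenations resolve as:

    "VMOV" # MoveStr # "rr"  -->  VMOVAPDrr   (register source; alignment irrelevant)
    "VMOV" # LoadStr # "rm"  -->  VMOVUPDrm   (memory source; may be unaligned)

so only the register form keeps the alignment-checking "A" variant.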
diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll
index 6ecd8116443..0f2cf594b1c 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -28,13 +28,13 @@ define <4 x double> @merge_4f64_2f64_23(<2 x double>* %ptr) nounwind uwtable noi
 define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_4f64_2f64_2z:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
+; AVX-NEXT:    vmovups 32(%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
 ; X32-AVX-LABEL: merge_4f64_2f64_2z:
 ; X32-AVX:       # %bb.0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vmovaps 32(%eax), %xmm0
+; X32-AVX-NEXT:    vmovups 32(%eax), %xmm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
   %val0 = load <2 x double>, <2 x double>* %ptr0
@@ -109,13 +109,13 @@ define <4 x double> @merge_4f64_f64_34uu(double* %ptr) nounwind uwtable noinline
 define <4 x double> @merge_4f64_f64_45zz(double* %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_4f64_f64_45zz:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 32(%rdi), %xmm0
+; AVX-NEXT:    vmovups 32(%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
 ; X32-AVX-LABEL: merge_4f64_f64_45zz:
 ; X32-AVX:       # %bb.0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vmovaps 32(%eax), %xmm0
+; X32-AVX-NEXT:    vmovups 32(%eax), %xmm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds double, double* %ptr, i64 4
   %ptr1 = getelementptr inbounds double, double* %ptr, i64 5
@@ -155,13 +155,13 @@ define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline
 define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_4i64_2i64_3z:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 48(%rdi), %xmm0
+; AVX-NEXT:    vmovups 48(%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
 ; X32-AVX-LABEL: merge_4i64_2i64_3z:
 ; X32-AVX:       # %bb.0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vmovaps 48(%eax), %xmm0
+; X32-AVX-NEXT:    vmovups 48(%eax), %xmm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 3
   %val0 = load <2 x i64>, <2 x i64>* %ptr0
@@ -217,13 +217,13 @@ define <4 x i64> @merge_4i64_i64_1zzu(i64* %ptr) nounwind uwtable noinline ssp {
 define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_4i64_i64_23zz:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
+; AVX-NEXT:    vmovups 16(%rdi), %xmm0
 ; AVX-NEXT:    retq
 ;
 ; X32-AVX-LABEL: merge_4i64_i64_23zz:
 ; X32-AVX:       # %bb.0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vmovaps 16(%eax), %xmm0
+; X32-AVX-NEXT:    vmovups 16(%eax), %xmm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 2
   %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 3
diff --git a/test/CodeGen/X86/merge-consecutive-loads-512.ll b/test/CodeGen/X86/merge-consecutive-loads-512.ll
index 62102eb382c..3c6eaf65292 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -106,13 +106,13 @@ define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noin
 define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8f64_f64_12zzuuzz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovaps 8(%rdi), %xmm0
+; ALL-NEXT:    vmovups 8(%rdi), %xmm0
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
 ; X32-AVX512F:       # %bb.0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovaps 8(%eax), %xmm0
+; X32-AVX512F-NEXT:    vmovups 8(%eax), %xmm0
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
   %ptr1 = getelementptr inbounds double, double* %ptr, i64 2
@@ -190,7 +190,7 @@ define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline
 define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8i64_i64_56zz9uzz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovaps 40(%rdi), %xmm0
+; ALL-NEXT:    vmovups 40(%rdi), %xmm0
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
@@ -198,7 +198,7 @@ define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline s
 ; X32-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
 ; X32-AVX512F:       # %bb.0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovaps 40(%eax), %xmm0
+; X32-AVX512F-NEXT:    vmovups 40(%eax), %xmm0
 ; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; X32-AVX512F-NEXT:    retl