Commit 0f27bf2

rotateright authored and llvmbot committed
[x86] enable fast sqrtss/sqrtps tuning for AMD Zen cores
As discussed in D118534, all of the recent AMD CPUs have relatively fast (<14 cycle latency) "sqrtss" and "sqrtps" instructions:
https://uops.info/table.html?search=sqrtps&cb_lat=on&cb_tp=on&cb_SNB=on&cb_SKL=on&cb_ZENp=on&cb_ZEN2=on&cb_ZEN3=on&cb_measurements=on&cb_avx=on&cb_sse=on

So we should set this tuning flag to alter codegen of plain "sqrt(X)" expansion (as opposed to reciprocal-sqrt - there is other test coverage for that pattern). The expansion is both slower and less accurate than the hardware instruction.

Differential Revision: https://reviews.llvm.org/D119001

(cherry picked from commit fff3e1d)
1 parent ebe5b44 commit 0f27bf2
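
For context, the "expansion" the message refers to approximates sqrt(x) as x * rsqrt(x) plus one Newton-Raphson refinement step, which is what the removed SLOW-SCALAR/ZN1/ZN3 check lines below encode (vrsqrtss/vrsqrtps followed by mul/fma chains and a zero/denormal guard). A minimal C sketch of that idea, for illustration only - the function name is made up, and the real lowering happens in the X86 backend, not in user code:

    #include <immintrin.h>

    /* Approximate sqrtf(x) the way the fast-math expansion does:
       hardware reciprocal-sqrt estimate (~12 bits) refined by one
       Newton-Raphson step. The real codegen additionally masks the
       result to zero when x is zero (daz) or denormal (no_daz);
       that guard is omitted here. */
    static float sqrt_via_rsqrt(float x) {
        float r = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x))); /* r ~= 1/sqrt(x) */
        float xr = x * r;                                     /* xr ~= sqrt(x)  */
        return xr * (1.5f - 0.5f * xr * r);  /* NR step: refines to ~22 bits */
    }

Against a single vsqrtss (under 14 cycles on Zen), this multi-instruction sequence is both slower and, at roughly 22 bits, less accurate than the correctly rounded hardware result.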

File tree

2 files changed: +21, -164 lines

llvm/lib/Target/X86/X86.td

Lines changed: 2 additions & 0 deletions
@@ -1169,6 +1169,8 @@ def ProcessorFeatures {
 TuningFastBEXTR,
 TuningFast15ByteNOP,
 TuningBranchFusion,
+TuningFastScalarFSQRT,
+TuningFastVectorFSQRT,
 TuningFastScalarShiftMasks,
 TuningFastMOVBE,
 TuningSlowSHLD,
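
With those two flags in the Zen tuning list, fast-math square roots should lower to the hardware instructions for the CPUs that use this list. A hypothetical way to eyeball the effect with a compiler built from this branch (file name and exact flags are illustrative):

    /* sqrt_tune.c - try: clang -O2 -ffast-math -march=znver3 -S sqrt_tune.c */
    #include <math.h>

    float scalar_root(float x) { return sqrtf(x); } /* expect a lone vsqrtss */

    void vector_roots(float *restrict out, const float *restrict in, int n) {
        for (int i = 0; i < n; ++i)        /* vectorized loop: expect vsqrtps */
            out[i] = sqrtf(in[i]);
    }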

llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll

Lines changed: 19 additions & 164 deletions
@@ -2,9 +2,9 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=nehalem | FileCheck %s --check-prefixes=NHM
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s --check-prefixes=FAST-SCALAR,SNB
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=broadwell | FileCheck %s --check-prefixes=FAST-SCALAR,BDW
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=FAST-SCALAR,SKL
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=SLOW-SCALAR,ZN1
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=SLOW-SCALAR,ZN3
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR
 
 define float @f32_no_daz(float %f) #0 {
 ; NHM-LABEL: f32_no_daz:
@@ -26,19 +26,6 @@ define float @f32_no_daz(float %f) #0 {
 ; FAST-SCALAR: # %bb.0:
 ; FAST-SCALAR-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
 ; FAST-SCALAR-NEXT: retq
-;
-; SLOW-SCALAR-LABEL: f32_no_daz:
-; SLOW-SCALAR: # %bb.0:
-; SLOW-SCALAR-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
-; SLOW-SCALAR-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
-; SLOW-SCALAR-NEXT: vmulss %xmm1, %xmm0, %xmm2
-; SLOW-SCALAR-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
-; SLOW-SCALAR-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; SLOW-SCALAR-NEXT: vandps %xmm3, %xmm0, %xmm0
-; SLOW-SCALAR-NEXT: vcmpltss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; SLOW-SCALAR-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; SLOW-SCALAR-NEXT: vandnps %xmm1, %xmm0, %xmm0
-; SLOW-SCALAR-NEXT: retq
 %call = tail call fast float @llvm.sqrt.f32(float %f) #2
 ret float %call
 }
@@ -91,42 +78,10 @@ define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 {
 ; BDW-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; BDW-NEXT: retq
 ;
-; SKL-LABEL: v4f32_no_daz:
-; SKL: # %bb.0:
-; SKL-NEXT: vsqrtps %xmm0, %xmm0
-; SKL-NEXT: retq
-;
-; ZN1-LABEL: v4f32_no_daz:
-; ZN1: # %bb.0:
-; ZN1-NEXT: vrsqrtps %xmm0, %xmm1
-; ZN1-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; ZN1-NEXT: vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN]
-; ZN1-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; ZN1-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
-; ZN1-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; ZN1-NEXT: vandps %xmm4, %xmm0, %xmm0
-; ZN1-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; ZN1-NEXT: vmulps %xmm3, %xmm1, %xmm1
-; ZN1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
-; ZN1-NEXT: vcmpleps %xmm0, %xmm3, %xmm0
-; ZN1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; ZN1-NEXT: retq
-;
-; ZN3-LABEL: v4f32_no_daz:
-; ZN3: # %bb.0:
-; ZN3-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; ZN3-NEXT: vrsqrtps %xmm0, %xmm1
-; ZN3-NEXT: vbroadcastss {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN]
-; ZN3-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; ZN3-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
-; ZN3-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; ZN3-NEXT: vandps %xmm4, %xmm0, %xmm0
-; ZN3-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; ZN3-NEXT: vmulps %xmm3, %xmm1, %xmm1
-; ZN3-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
-; ZN3-NEXT: vcmpleps %xmm0, %xmm3, %xmm0
-; ZN3-NEXT: vandps %xmm1, %xmm0, %xmm0
-; ZN3-NEXT: retq
+; FAST-VECTOR-LABEL: v4f32_no_daz:
+; FAST-VECTOR: # %bb.0:
+; FAST-VECTOR-NEXT: vsqrtps %xmm0, %xmm0
+; FAST-VECTOR-NEXT: retq
 %call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2
 ret <4 x float> %call
 }
@@ -194,42 +149,10 @@ define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 {
 ; BDW-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; BDW-NEXT: retq
 ;
-; SKL-LABEL: v8f32_no_daz:
-; SKL: # %bb.0:
-; SKL-NEXT: vsqrtps %ymm0, %ymm0
-; SKL-NEXT: retq
-;
-; ZN1-LABEL: v8f32_no_daz:
-; ZN1: # %bb.0:
-; ZN1-NEXT: vrsqrtps %ymm0, %ymm1
-; ZN1-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; ZN1-NEXT: vbroadcastss {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; ZN1-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; ZN1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; ZN1-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3
-; ZN1-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; ZN1-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; ZN1-NEXT: vmulps %ymm3, %ymm1, %ymm1
-; ZN1-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
-; ZN1-NEXT: vcmpleps %ymm0, %ymm3, %ymm0
-; ZN1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; ZN1-NEXT: retq
-;
-; ZN3-LABEL: v8f32_no_daz:
-; ZN3: # %bb.0:
-; ZN3-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; ZN3-NEXT: vrsqrtps %ymm0, %ymm1
-; ZN3-NEXT: vbroadcastss {{.*#+}} ymm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
-; ZN3-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; ZN3-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3
-; ZN3-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; ZN3-NEXT: vandps %ymm4, %ymm0, %ymm0
-; ZN3-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; ZN3-NEXT: vmulps %ymm3, %ymm1, %ymm1
-; ZN3-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
-; ZN3-NEXT: vcmpleps %ymm0, %ymm3, %ymm0
-; ZN3-NEXT: vandps %ymm1, %ymm0, %ymm0
-; ZN3-NEXT: retq
+; FAST-VECTOR-LABEL: v8f32_no_daz:
+; FAST-VECTOR: # %bb.0:
+; FAST-VECTOR-NEXT: vsqrtps %ymm0, %ymm0
+; FAST-VECTOR-NEXT: retq
 %call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2
 ret <8 x float> %call
 }
@@ -256,18 +179,6 @@ define float @f32_daz(float %f) #1 {
 ; FAST-SCALAR: # %bb.0:
 ; FAST-SCALAR-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
 ; FAST-SCALAR-NEXT: retq
-;
-; SLOW-SCALAR-LABEL: f32_daz:
-; SLOW-SCALAR: # %bb.0:
-; SLOW-SCALAR-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
-; SLOW-SCALAR-NEXT: vmulss %xmm1, %xmm0, %xmm2
-; SLOW-SCALAR-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
-; SLOW-SCALAR-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; SLOW-SCALAR-NEXT: vmulss %xmm1, %xmm2, %xmm1
-; SLOW-SCALAR-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; SLOW-SCALAR-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0
-; SLOW-SCALAR-NEXT: vandnps %xmm1, %xmm0, %xmm0
-; SLOW-SCALAR-NEXT: retq
 %call = tail call fast float @llvm.sqrt.f32(float %f) #2
 ret float %call
 }
@@ -315,38 +226,10 @@ define <4 x float> @v4f32_daz(<4 x float> %f) #1 {
 ; BDW-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; BDW-NEXT: retq
 ;
-; SKL-LABEL: v4f32_daz:
-; SKL: # %bb.0:
-; SKL-NEXT: vsqrtps %xmm0, %xmm0
-; SKL-NEXT: retq
-;
-; ZN1-LABEL: v4f32_daz:
-; ZN1: # %bb.0:
-; ZN1-NEXT: vrsqrtps %xmm0, %xmm1
-; ZN1-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; ZN1-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; ZN1-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
-; ZN1-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; ZN1-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; ZN1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ZN1-NEXT: vcmpneqps %xmm2, %xmm0, %xmm0
-; ZN1-NEXT: vmulps %xmm3, %xmm1, %xmm1
-; ZN1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; ZN1-NEXT: retq
-;
-; ZN3-LABEL: v4f32_daz:
-; ZN3: # %bb.0:
-; ZN3-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; ZN3-NEXT: vrsqrtps %xmm0, %xmm1
-; ZN3-NEXT: vmulps %xmm1, %xmm0, %xmm2
-; ZN3-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
-; ZN3-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; ZN3-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; ZN3-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ZN3-NEXT: vcmpneqps %xmm2, %xmm0, %xmm0
-; ZN3-NEXT: vmulps %xmm3, %xmm1, %xmm1
-; ZN3-NEXT: vandps %xmm1, %xmm0, %xmm0
-; ZN3-NEXT: retq
+; FAST-VECTOR-LABEL: v4f32_daz:
+; FAST-VECTOR: # %bb.0:
+; FAST-VECTOR-NEXT: vsqrtps %xmm0, %xmm0
+; FAST-VECTOR-NEXT: retq
 %call = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #2
 ret <4 x float> %call
 }
@@ -405,38 +288,10 @@ define <8 x float> @v8f32_daz(<8 x float> %f) #1 {
 ; BDW-NEXT: vandps %ymm1, %ymm0, %ymm0
 ; BDW-NEXT: retq
 ;
-; SKL-LABEL: v8f32_daz:
-; SKL: # %bb.0:
-; SKL-NEXT: vsqrtps %ymm0, %ymm0
-; SKL-NEXT: retq
-;
-; ZN1-LABEL: v8f32_daz:
-; ZN1: # %bb.0:
-; ZN1-NEXT: vrsqrtps %ymm0, %ymm1
-; ZN1-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; ZN1-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; ZN1-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3
-; ZN1-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; ZN1-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; ZN1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ZN1-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; ZN1-NEXT: vmulps %ymm3, %ymm1, %ymm1
-; ZN1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; ZN1-NEXT: retq
-;
-; ZN3-LABEL: v8f32_daz:
-; ZN3: # %bb.0:
-; ZN3-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; ZN3-NEXT: vrsqrtps %ymm0, %ymm1
-; ZN3-NEXT: vmulps %ymm1, %ymm0, %ymm2
-; ZN3-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3
-; ZN3-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; ZN3-NEXT: vmulps %ymm1, %ymm2, %ymm1
-; ZN3-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; ZN3-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; ZN3-NEXT: vmulps %ymm3, %ymm1, %ymm1
-; ZN3-NEXT: vandps %ymm1, %ymm0, %ymm0
-; ZN3-NEXT: retq
+; FAST-VECTOR-LABEL: v8f32_daz:
+; FAST-VECTOR: # %bb.0:
+; FAST-VECTOR-NEXT: vsqrtps %ymm0, %ymm0
+; FAST-VECTOR-NEXT: retq
 %call = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %f) #2
 ret <8 x float> %call
 }
