Skip to content

Commit aee8522

Browse files
ngzhianCommit Bot
authored andcommitted
[wasm-simd][x64] Prototype extended pairwise addition
Add new macro-assembler instructions that can handle both AVX and SSE. In the SSE case it checks that dst == src1. (This is different from what the AvxHelper does, which passes dst as the first operand to AVX instructions.) Sorted SSSE3_INSTRUCTION_LIST by instruction code. Header additions were added by clangd; we were already using something from those headers via transitive includes, and adding them explicitly gets us closer to IWYU. Codegen sequences are from WebAssembly/simd#380 and also WebAssembly/simd#380 (comment). Bug: v8:11086 Change-Id: I4c04f836e471ed8b00f9ff1a1b2e6348a593d4de Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2578797 Commit-Queue: Zhi An Ng <[email protected]> Reviewed-by: Bill Budge <[email protected]> Cr-Commit-Position: refs/heads/master@{#71688}
1 parent 26f9016 commit aee8522

File tree

9 files changed

+132
-8
lines changed

9 files changed

+132
-8
lines changed

src/codegen/x64/macro-assembler-x64.cc

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1727,6 +1727,29 @@ void TurboAssembler::RetpolineJump(Register reg) {
17271727
ret(0);
17281728
}
17291729

1730+
// Emits a packed multiply-add of words, choosing the encoding by CPU support.
//   dst  - destination register.
//   src1 - first source; must equal dst when AVX is unavailable.
//   src2 - second source.
// With AVX the three-operand vpmaddwd is used, so dst may differ from src1.
// Otherwise the two-operand pmaddwd is emitted.
void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
1731+
XMMRegister src2) {
1732+
if (CpuFeatures::IsSupported(AVX)) {
1733+
CpuFeatureScope avx_scope(this, AVX);
1734+
vpmaddwd(dst, src1, src2);
1735+
} else {
1736+
// The SSE form only takes two operands (dst doubles as the first source),
// so callers must have arranged for dst == src1.
DCHECK_EQ(dst, src1);
1737+
pmaddwd(dst, src2);
1738+
}
1739+
}
1740+
1741+
// Emits pmaddubsw, choosing the encoding by CPU support.
//   dst  - destination register.
//   src1 - first source; must equal dst when AVX is unavailable.
//   src2 - second source.
// With AVX the three-operand vpmaddubsw is used, so dst may differ from src1.
// Otherwise the two-operand pmaddubsw is emitted inside an SSSE3 feature
// scope.
void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
1742+
XMMRegister src2) {
1743+
if (CpuFeatures::IsSupported(AVX)) {
1744+
CpuFeatureScope avx_scope(this, AVX);
1745+
vpmaddubsw(dst, src1, src2);
1746+
} else {
1747+
// NOTE(review): SSSE3 is enabled here without an IsSupported(SSSE3) check;
// presumably callers guarantee SSSE3 is available - confirm.
CpuFeatureScope ssse3_scope(this, SSSE3);
1748+
// The SSE form only takes two operands (dst doubles as the first source),
// so callers must have arranged for dst == src1.
DCHECK_EQ(dst, src1);
1749+
pmaddubsw(dst, src2);
1750+
}
1751+
}
1752+
17301753
void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src, byte imm8) {
17311754
if (CpuFeatures::IsSupported(AVX)) {
17321755
CpuFeatureScope avx_scope(this, AVX);
@@ -1956,11 +1979,16 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
19561979
}
19571980

19581981
// In-place overload: forwards to the three-operand Psrld with dst used as
// both destination and source.
void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
1982+
Psrld(dst, dst, imm8);
1983+
}
1984+
1985+
// Emits psrld/vpsrld with an immediate operand imm8, choosing the encoding by
// CPU support.
//   dst  - destination register.
//   src  - source register; must equal dst when AVX is unavailable.
//   imm8 - immediate passed through to the instruction.
// With AVX the vpsrld form with separate dst and src operands is used.
// Otherwise psrld is emitted with dst as its only register operand.
void TurboAssembler::Psrld(XMMRegister dst, XMMRegister src, byte imm8) {
19591986
if (CpuFeatures::IsSupported(AVX)) {
19601987
CpuFeatureScope scope(this, AVX);
1961-
vpsrld(dst, dst, imm8);
1988+
vpsrld(dst, src, imm8);
19621989
} else {
19631990
DCHECK(!IsEnabled(AVX));
1991+
// psrld has no separate source operand, so callers must have arranged for
// dst == src.
DCHECK_EQ(dst, src);
19641992
psrld(dst, imm8);
19651993
}
19661994
}

src/codegen/x64/macro-assembler-x64.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
208208
AVX_OP(Psrlw, psrlw)
209209
AVX_OP(Psrld, psrld)
210210
AVX_OP(Psrlq, psrlq)
211-
AVX_OP(Pmaddwd, pmaddwd)
212211
AVX_OP(Paddb, paddb)
213212
AVX_OP(Paddw, paddw)
214213
AVX_OP(Paddd, paddd)
@@ -522,6 +521,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
522521
void Trap() override;
523522
void DebugBreak() override;
524523

524+
// Supports both AVX (dst may differ from src1) and SSE (checks that dst == src1).
525+
void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
526+
void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
527+
525528
// Shufps that will mov src into dst if AVX is not supported.
526529
void Shufps(XMMRegister dst, XMMRegister src, byte imm8);
527530

@@ -546,6 +549,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
546549
void Pslld(XMMRegister dst, byte imm8);
547550
void Psrld(XMMRegister dst, byte imm8);
548551

552+
// Supports both AVX (dst may differ from src) and SSE (checks that dst == src).
553+
void Psrld(XMMRegister dst, XMMRegister src, byte imm8);
554+
549555
void Pblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2,
550556
XMMRegister mask);
551557
void Blendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2,

src/codegen/x64/sse-instr.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,10 @@
133133
V(maxsd, F2, 0F, 5F)
134134

135135
#define SSSE3_INSTRUCTION_LIST(V) \
136-
V(phaddd, 66, 0F, 38, 02) \
137-
V(phaddw, 66, 0F, 38, 01) \
138136
V(pshufb, 66, 0F, 38, 00) \
137+
V(phaddw, 66, 0F, 38, 01) \
138+
V(phaddd, 66, 0F, 38, 02) \
139+
V(pmaddubsw, 66, 0F, 38, 04) \
139140
V(psignb, 66, 0F, 38, 08) \
140141
V(psignw, 66, 0F, 38, 09) \
141142
V(psignd, 66, 0F, 38, 0A)

src/compiler/backend/instruction-selector.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2808,7 +2808,7 @@ void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
28082808
}
28092809
#endif // !V8_TARGET_ARCH_ARM64
28102810

2811-
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
2811+
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
28122812
// TODO(v8:11086) Prototype extended pairwise add.
28132813
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
28142814
UNIMPLEMENTED();
@@ -2822,7 +2822,7 @@ void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) {
28222822
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
28232823
UNIMPLEMENTED();
28242824
}
2825-
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
2825+
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
28262826

28272827
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64 && \
28282828
!V8_TARGET_ARCH_ARM

src/compiler/backend/x64/code-generator-x64.cc

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "src/codegen/macro-assembler.h"
1010
#include "src/codegen/optimized-compilation-info.h"
1111
#include "src/codegen/x64/assembler-x64.h"
12+
#include "src/codegen/x64/register-x64.h"
1213
#include "src/compiler/backend/code-generator-impl.h"
1314
#include "src/compiler/backend/code-generator.h"
1415
#include "src/compiler/backend/gap-resolver.h"
@@ -3085,6 +3086,35 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
30853086
ASSEMBLE_SIMD_BINOP(pmaddwd);
30863087
break;
30873088
}
3089+
case kX64I32x4ExtAddPairwiseI16x8S: {
3090+
XMMRegister dst = i.OutputSimd128Register();
3091+
XMMRegister src = i.InputSimd128Register(0);
3092+
// kScratchDoubleReg = |1|1|1|1|1|1|1|1|
3093+
__ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
3094+
__ Psrlw(kScratchDoubleReg, byte{15});
3095+
// pmaddwd multiplies signed words in kScratchDoubleReg and src, producing
3096+
// signed doublewords, then adds pairwise.
3097+
// src = |a|b|c|d|e|f|g|h|
3098+
// dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
3099+
__ Pmaddwd(dst, src, kScratchDoubleReg);
3100+
break;
3101+
}
3102+
case kX64I32x4ExtAddPairwiseI16x8U: {
3103+
XMMRegister dst = i.OutputSimd128Register();
3104+
XMMRegister src = i.InputSimd128Register(0);
3105+
3106+
// src = |a|b|c|d|e|f|g|h|
3107+
// kScratchDoubleReg = i32x4.splat(0x0000FFFF)
3108+
__ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
3109+
__ Psrld(kScratchDoubleReg, byte{16});
3110+
// kScratchDoubleReg =|0|b|0|d|0|f|0|h|
3111+
__ Pand(kScratchDoubleReg, src);
3112+
// dst = |0|a|0|c|0|e|0|g|
3113+
__ Psrld(dst, src, byte{16});
3114+
// dst = |a+b|c+d|e+f|g+h|
3115+
__ Paddd(dst, kScratchDoubleReg);
3116+
break;
3117+
}
30883118
case kX64S128Const: {
30893119
// Emit code for generic constants as all zeros, or ones cases will be
30903120
// handled separately by the selector.
@@ -3297,6 +3327,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
32973327
ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovzxbw);
32983328
break;
32993329
}
3330+
case kX64I16x8ExtAddPairwiseI8x16S: {
3331+
XMMRegister dst = i.OutputSimd128Register();
3332+
XMMRegister src = i.InputSimd128Register(0);
3333+
DCHECK_NE(dst, src);
3334+
// dst = i8x16.splat(1)
3335+
__ Move(dst, uint32_t{0x01010101});
3336+
__ Pshufd(dst, dst, byte{0});
3337+
__ Pmaddubsw(dst, dst, src);
3338+
break;
3339+
}
3340+
case kX64I16x8ExtAddPairwiseI8x16U: {
3341+
XMMRegister dst = i.OutputSimd128Register();
3342+
XMMRegister src = i.InputSimd128Register(0);
3343+
// kScratchDoubleReg = i8x16.splat(1)
3344+
__ Move(kScratchDoubleReg, uint32_t{0x01010101});
3345+
__ Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0});
3346+
__ Pmaddubsw(dst, src, kScratchDoubleReg);
3347+
break;
3348+
}
33003349
case kX64I8x16Splat: {
33013350
XMMRegister dst = i.OutputSimd128Register();
33023351
if (HasRegisterInput(instr, 0)) {

src/compiler/backend/x64/instruction-codes-x64.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,8 @@ namespace compiler {
250250
V(X64I32x4ExtMulHighI16x8S) \
251251
V(X64I32x4ExtMulLowI16x8U) \
252252
V(X64I32x4ExtMulHighI16x8U) \
253+
V(X64I32x4ExtAddPairwiseI16x8S) \
254+
V(X64I32x4ExtAddPairwiseI16x8U) \
253255
V(X64I16x8Splat) \
254256
V(X64I16x8ExtractLaneS) \
255257
V(X64I16x8SConvertI8x16Low) \
@@ -288,6 +290,8 @@ namespace compiler {
288290
V(X64I16x8ExtMulHighI8x16S) \
289291
V(X64I16x8ExtMulLowI8x16U) \
290292
V(X64I16x8ExtMulHighI8x16U) \
293+
V(X64I16x8ExtAddPairwiseI8x16S) \
294+
V(X64I16x8ExtAddPairwiseI8x16U) \
291295
V(X64I8x16Splat) \
292296
V(X64I8x16ExtractLaneS) \
293297
V(X64Pinsrb) \

src/compiler/backend/x64/instruction-scheduler-x64.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
226226
case kX64I32x4ExtMulHighI16x8S:
227227
case kX64I32x4ExtMulLowI16x8U:
228228
case kX64I32x4ExtMulHighI16x8U:
229+
case kX64I32x4ExtAddPairwiseI16x8S:
230+
case kX64I32x4ExtAddPairwiseI16x8U:
229231
case kX64I16x8Splat:
230232
case kX64I16x8ExtractLaneS:
231233
case kX64I16x8SConvertI8x16Low:
@@ -264,6 +266,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
264266
case kX64I16x8ExtMulHighI8x16S:
265267
case kX64I16x8ExtMulLowI8x16U:
266268
case kX64I16x8ExtMulHighI8x16U:
269+
case kX64I16x8ExtAddPairwiseI8x16S:
270+
case kX64I16x8ExtAddPairwiseI8x16U:
267271
case kX64I8x16Splat:
268272
case kX64I8x16ExtractLaneS:
269273
case kX64I8x16SConvertI16x8:

src/compiler/backend/x64/instruction-selector-x64.cc

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "src/base/logging.h"
99
#include "src/base/overflowing-math.h"
1010
#include "src/base/platform/wrappers.h"
11+
#include "src/codegen/cpu-features.h"
1112
#include "src/codegen/machine-type.h"
1213
#include "src/compiler/backend/instruction-selector-impl.h"
1314
#include "src/compiler/machine-operator.h"
@@ -3617,6 +3618,37 @@ void InstructionSelector::VisitF64x2Pmax(Node* node) {
36173618
VisitPminOrPmax(this, node, kX64F64x2Pmax);
36183619
}
36193620

3621+
// Selects kX64I32x4ExtAddPairwiseI16x8S for |node|, with input 0 in a
// register.
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
3622+
X64OperandGenerator g(this);
3623+
// The code generator emits Pmaddwd(dst, src, scratch), whose SSE path
// DCHECKs dst == src. Without AVX the output is therefore defined with
// DefineSameAsFirst (by its name, tied to input 0); with AVX any register
// is allowed.
InstructionOperand dst = CpuFeatures::IsSupported(AVX)
3624+
? g.DefineAsRegister(node)
3625+
: g.DefineSameAsFirst(node);
3626+
Emit(kX64I32x4ExtAddPairwiseI16x8S, dst, g.UseRegister(node->InputAt(0)));
3627+
}
3628+
3629+
// Selects kX64I32x4ExtAddPairwiseI16x8U for |node|, with input 0 in a
// register.
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8U(Node* node) {
3630+
X64OperandGenerator g(this);
3631+
// The code generator emits Psrld(dst, src, 16), whose SSE path DCHECKs
// dst == src. Without AVX the output is therefore defined with
// DefineSameAsFirst (by its name, tied to input 0); with AVX any register
// is allowed.
InstructionOperand dst = CpuFeatures::IsSupported(AVX)
3632+
? g.DefineAsRegister(node)
3633+
: g.DefineSameAsFirst(node);
3634+
Emit(kX64I32x4ExtAddPairwiseI16x8U, dst, g.UseRegister(node->InputAt(0)));
3635+
}
3636+
3637+
// Selects kX64I16x8ExtAddPairwiseI8x16S for |node|. Unlike the sibling
// ExtAddPairwise visitors, the operand constraints here do not depend on AVX.
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) {
3638+
X64OperandGenerator g(this);
3639+
// Codegen depends on dst != src.
// The code generator writes the splat-of-ones constant into dst before it
// reads src (and DCHECK_NEs the two), so the input is requested with
// UseUniqueRegister - presumably to keep it from sharing a register with the
// output; confirm against the register allocator's contract.
3640+
Emit(kX64I16x8ExtAddPairwiseI8x16S, g.DefineAsRegister(node),
3641+
g.UseUniqueRegister(node->InputAt(0)));
3642+
}
3643+
3644+
// Selects kX64I16x8ExtAddPairwiseI8x16U for |node|, with input 0 in a
// register.
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
3645+
X64OperandGenerator g(this);
3646+
// The code generator emits Pmaddubsw(dst, src, scratch), whose SSE path
// DCHECKs dst == src. Without AVX the output is therefore defined with
// DefineSameAsFirst (by its name, tied to input 0); with AVX any register
// is allowed.
InstructionOperand dst = CpuFeatures::IsSupported(AVX)
3647+
? g.DefineAsRegister(node)
3648+
: g.DefineSameAsFirst(node);
3649+
Emit(kX64I16x8ExtAddPairwiseI8x16U, dst, g.UseRegister(node->InputAt(0)));
3650+
}
3651+
36203652
// static
36213653
MachineOperatorBuilder::Flags
36223654
InstructionSelector::SupportedMachineOperatorFlags() {

test/cctest/wasm/test-run-wasm-simd.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1882,7 +1882,7 @@ WASM_SIMD_TEST(S128Not) {
18821882
[](int32_t x) { return ~x; });
18831883
}
18841884

1885-
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
1885+
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
18861886
// TODO(v8:11086) Prototype i32x4.extadd_pairwise_i16x8_{s,u}
18871887
template <typename Narrow, typename Wide>
18881888
void RunExtAddPairwiseTest(TestExecutionTier execution_tier,
@@ -1931,7 +1931,7 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8ExtAddPairwiseI8x16U) {
19311931
kExprI16x8ExtAddPairwiseI8x16U,
19321932
kExprI8x16Splat);
19331933
}
1934-
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
1934+
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
19351935

19361936
void RunI32x4BinOpTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
19371937
WasmOpcode opcode, Int32BinOp expected_op) {

0 commit comments

Comments
 (0)