Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions src/CodeGen_ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1143,12 +1143,18 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init
Expr pattern;
const char *intrin;
Target::Feature required_feature;
std::vector<int> extra_operands;
};
// clang-format off
static const Pattern patterns[] = {
{VectorReduce::Add, 4, i32(widening_mul(wild_i8x_, wild_i8x_)), "dot_product", Target::ARMDotProd},
{VectorReduce::Add, 4, i32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::ARMDotProd},
{VectorReduce::Add, 4, u32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::ARMDotProd},
// A sum is the same as a dot product with a vector of ones, and lowering it
// to the dot product intrinsic appears to be a bit faster.
{VectorReduce::Add, 4, i32(wild_i8x_), "dot_product", Target::ARMDotProd, {1}},
{VectorReduce::Add, 4, i32(wild_u8x_), "dot_product", Target::ARMDotProd, {1}},
{VectorReduce::Add, 4, u32(wild_u8x_), "dot_product", Target::ARMDotProd, {1}},
};
// clang-format on

Expand All @@ -1162,13 +1168,17 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init
continue;
}
if (expr_match(p.pattern, op->value, matches)) {
if (factor != 4) {
Expr equiv = VectorReduce::make(op->op, op->value, op->value.type().lanes() / 4);
if (factor != p.factor) {
Expr equiv = VectorReduce::make(op->op, op->value, op->value.type().lanes() / p.factor);
equiv = VectorReduce::make(op->op, equiv, op->type.lanes());
codegen_vector_reduce(equiv.as<VectorReduce>(), init);
return;
}

for (int i : p.extra_operands) {
matches.push_back(make_const(matches[0].type(), i));
}

Expr i = init;
if (!i.defined()) {
i = make_zero(op->type);
Expand Down
8 changes: 8 additions & 0 deletions test/correctness/simd_op_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1110,6 +1110,14 @@ class SimdOpCheck : public SimdOpCheckTest {
for (int v : {2, 4}) {
check("udot", v, sum(u32(in_u8(f * x + r)) * in_u8(f * x + r + 32)));
check("sdot", v, sum(i32(in_i8(f * x + r)) * in_i8(f * x + r + 32)));
if (f == 4) {
// udot/sdot is not generated for a plain sum at higher reduction factors
// because the intermediate is 16-bit instead of 32-bit. It seems like it
// would be slower to fix this (because the intermediate sum would be
// 32-bit instead of 16-bit).
check("udot", v, sum(u32(in_u8(f * x + r))));
check("sdot", v, sum(i32(in_i8(f * x + r))));
}
}
}
}
Expand Down