Use dot products for sums. (#5954)

dsharletg · web-flow · commit 94c0ecabd977 · 2021-05-03T18:17:39.000-06:00
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
@@ -1143,12 +1143,18 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init
         Expr pattern;
         const char *intrin;
         Target::Feature required_feature;
+        std::vector<int> extra_operands;
     };
     // clang-format off
     static const Pattern patterns[] = {
         {VectorReduce::Add, 4, i32(widening_mul(wild_i8x_, wild_i8x_)), "dot_product", Target::ARMDotProd},
         {VectorReduce::Add, 4, i32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::ARMDotProd},
         {VectorReduce::Add, 4, u32(widening_mul(wild_u8x_, wild_u8x_)), "dot_product", Target::ARMDotProd},
+        // A sum is the same as a dot product with a vector of ones, and using the
+        // dot product instruction for it appears to be a bit faster.
+        {VectorReduce::Add, 4, i32(wild_i8x_), "dot_product", Target::ARMDotProd, {1}},
+        {VectorReduce::Add, 4, i32(wild_u8x_), "dot_product", Target::ARMDotProd, {1}},
+        {VectorReduce::Add, 4, u32(wild_u8x_), "dot_product", Target::ARMDotProd, {1}},
     };
     // clang-format on
 
@@ -1162,13 +1168,17 @@ void CodeGen_ARM::codegen_vector_reduce(const VectorReduce *op, const Expr &init
             continue;
         }
         if (expr_match(p.pattern, op->value, matches)) {
-            if (factor != 4) {
-                Expr equiv = VectorReduce::make(op->op, op->value, op->value.type().lanes() / 4);
+            if (factor != p.factor) {
+                Expr equiv = VectorReduce::make(op->op, op->value, op->value.type().lanes() / p.factor);
                 equiv = VectorReduce::make(op->op, equiv, op->type.lanes());
                 codegen_vector_reduce(equiv.as<VectorReduce>(), init);
                 return;
             }
 
+            for (int i : p.extra_operands) {
+                matches.push_back(make_const(matches[0].type(), i));
+            }
+
             Expr i = init;
             if (!i.defined()) {
                 i = make_zero(op->type);
diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp
@@ -1110,6 +1110,14 @@ class SimdOpCheck : public SimdOpCheckTest {
                         for (int v : {2, 4}) {
                             check("udot", v, sum(u32(in_u8(f * x + r)) * in_u8(f * x + r + 32)));
                             check("sdot", v, sum(i32(in_i8(f * x + r)) * in_i8(f * x + r + 32)));
+                            if (f == 4) {
+                                // This doesn't generate for higher reduction factors because the
+                                // intermediate is 16-bit instead of 32-bit. It seems like it would
+                                // be slower to fix this (because the intermediate sum would be
+                                // 32-bit instead of 16-bit).
+                                check("udot", v, sum(u32(in_u8(f * x + r))));
+                                check("sdot", v, sum(i32(in_i8(f * x + r))));
+                            }
                         }
                     }
                 }

Original file line number	Diff line number	Diff line change
`@@ -1110,6 +1110,14 @@ class SimdOpCheck : public SimdOpCheckTest {`
`1110`	`1110`	`for (int v : {2, 4}) {`
`1111`	`1111`	`check("udot", v, sum(u32(in_u8(f * x + r)) * in_u8(f * x + r + 32)));`
`1112`	`1112`	`check("sdot", v, sum(i32(in_i8(f * x + r)) * in_i8(f * x + r + 32)));`
	`1113`	`+ if (f == 4) {`
	`1114`	`+ // This doesn't generate for higher reduction factors because the`
	`1115`	`+ // intermediate is 16-bit instead of 32-bit. It seems like it would`
	`1116`	`+ // be slower to fix this (because the intermediate sum would be`
	`1117`	`+ // 32-bit instead of 16-bit).`
	`1118`	`+ check("udot", v, sum(u32(in_u8(f * x + r))));`
	`1119`	`+ check("sdot", v, sum(i32(in_i8(f * x + r))));`
	`1120`	`+ }`
`1113`	`1121`	`}`
`1114`	`1122`	`}`
`1115`	`1123`	`}`