pytorch · cyyever · Jul 15, 2025
diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
@@ -447,6 +447,7 @@ void nbit_embedding_sanity_check(
   assert(
       (input_bit_rate == 2 || input_bit_rate == 4) &&
       "input_bit_rate must be 2 or 4");
+  // NOLINTNEXTLINE(bugprone-branch-clone)
   if constexpr (std::is_same_v<OutType, uint8_t>) {
     assert(
         (no_bag && input_bit_rate == 4 && output_bit_rate == 4) &&

diff --git a/src/EmbeddingSpMDMNBit.cc b/src/EmbeddingSpMDMNBit.cc
@@ -825,9 +825,7 @@ GenEmbeddingSpMDMNBitLookup<
                   a->vextracti32x4(temp_vreg.xmm(), src_vreg, asmjit::Imm(i));
                   a->vpmovsxbd(temp_vreg, temp_vreg.xmm());
                 } else {
-                  if (i == 1) {
-                    a->vpsrldq(src_vreg, src_vreg, asmjit::Imm(8));
-                  } else if (i == 2) {
+                  if (i == 2) {
                     a->vextractf128(
                         src_vreg.xmm(), temp2_vreg.ymm(), asmjit::Imm(i >> 1));
                   } else {

diff --git a/src/FbgemmConv.cc b/src/FbgemmConv.cc
@@ -149,8 +149,8 @@ int fbgemmConv(
             "For depthwise, only requantized output is supported");
 
         if constexpr (
-            processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
-          depthwise_3d_same_pad<QuantizationGranularity::TENSOR>(
+            processOutputType::QGRANType == QuantizationGranularity::TENSOR || processOutputType::QGRANType == QuantizationGranularity::GROUP || processOutputType::QGRANType == QuantizationGranularity::OUT_CHANNEL) {
+          depthwise_3d_same_pad<processOutputType::QGRANType>(
               *reinterpret_cast<const conv_param_t<3>*>(&conv_p),
               outProcess.getAZeroPoint(),
               activations,
@@ -165,41 +165,6 @@ int fbgemmConv(
               act_times_w_scale,
               thread_id,
               num_threads);
-        } else if constexpr (
-            processOutputType::QGRANType == QuantizationGranularity::GROUP) {
-          depthwise_3d_same_pad<QuantizationGranularity::GROUP>(
-              *reinterpret_cast<const conv_param_t<3>*>(&conv_p),
-              outProcess.getAZeroPoint(),
-              activations,
-              B_zero_point,
-              *(packed_weights.getPackedWForDepthwise()),
-              C_multiplier,
-              outProcess.getCZeroPoint(),
-              out,
-              outProcess.getColOffsets(),
-              outProcess.getBias(),
-              outProcess.RELU_FUSED, // fuse_relu
-              act_times_w_scale, // act_scale * weight_scale
-              thread_id,
-              num_threads);
-        } else if constexpr (
-            processOutputType::QGRANType ==
-            QuantizationGranularity::OUT_CHANNEL) {
-          depthwise_3d_same_pad<QuantizationGranularity::OUT_CHANNEL>(
-              *reinterpret_cast<const conv_param_t<3>*>(&conv_p),
-              outProcess.getAZeroPoint(),
-              activations,
-              B_zero_point,
-              *(packed_weights.getPackedWForDepthwise()),
-              C_multiplier,
-              outProcess.getCZeroPoint(),
-              out,
-              outProcess.getColOffsets(),
-              outProcess.getBias(),
-              outProcess.RELU_FUSED, // fuse_relu
-              act_times_w_scale, // act_scale * weight_scale
-              thread_id,
-              num_threads);
         } else {
           std::string msg =
               "[FBGEMM_CONV_ERROR] This quantization granularity is "
@@ -208,8 +173,8 @@ int fbgemmConv(
         }
       } else if constexpr (SPATIAL_DIM == 2) {
         if constexpr (
-            processOutputType::QGRANType == QuantizationGranularity::TENSOR) {
-          depthwise_2d_same_pad<QuantizationGranularity::TENSOR>(
+            processOutputType::QGRANType == QuantizationGranularity::TENSOR || processOutputType::QGRANType == QuantizationGranularity::GROUP || processOutputType::QGRANType == QuantizationGranularity::OUT_CHANNEL) {
+          depthwise_2d_same_pad<processOutputType::QGRANType>(
               conv_p.MB, // mini batch
               conv_p.IN_DIM[0], // H
               conv_p.IN_DIM[1], // W
@@ -230,54 +195,6 @@ int fbgemmConv(
               act_times_w_scale,
               thread_id,
               num_threads);
-        } else if constexpr (
-            processOutputType::QGRANType == QuantizationGranularity::GROUP) {
-          depthwise_2d_same_pad<QuantizationGranularity::GROUP>(
-              conv_p.MB, // mini batch
-              conv_p.IN_DIM[0], // H
-              conv_p.IN_DIM[1], // W
-              conv_p.IC, // input channels
-              conv_p.OC, // output channels
-              conv_p.stride[0], // stride_h
-              conv_p.stride[1], // stride_w
-              outProcess.getAZeroPoint(),
-              activations,
-              B_zero_point,
-              *(packed_weights.getPackedWForDepthwise()),
-              C_multiplier,
-              outProcess.getCZeroPoint(),
-              out,
-              outProcess.getColOffsets(),
-              outProcess.getBias(),
-              outProcess.RELU_FUSED, // fuse_relu
-              act_times_w_scale, // act_scale * weight_scale
-              thread_id,
-              num_threads);
-        } else if constexpr (
-            processOutputType::QGRANType ==
-            QuantizationGranularity::OUT_CHANNEL) {
-          // The number of input channels == groups for depthwise convolutions
-          depthwise_2d_same_pad<QuantizationGranularity::OUT_CHANNEL>(
-              conv_p.MB, // mini batch
-              conv_p.IN_DIM[0], // H
-              conv_p.IN_DIM[1], // W
-              conv_p.IC, // input channels
-              conv_p.OC, // output channels
-              conv_p.stride[0], // stride_h
-              conv_p.stride[1], // stride_w
-              outProcess.getAZeroPoint(),
-              activations,
-              B_zero_point,
-              *(packed_weights.getPackedWForDepthwise()),
-              C_multiplier,
-              outProcess.getCZeroPoint(),
-              out,
-              outProcess.getColOffsets(),
-              outProcess.getBias(),
-              outProcess.RELU_FUSED, // fuse_relu
-              act_times_w_scale, // act_scale * weight_scale
-              thread_id,
-              num_threads);
         } else {
           std::string msg =
               "[FBGEMM_CONV_ERROR] This quantization granularity is "

diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc
@@ -1265,8 +1265,6 @@ void requantizeOutputProcessingGConvAvx2(
                     _mm_set1_epi32(r.B_zero_point[quant_param_idx])),
                 _mm_set1_epi32(r.B_zero_point[quant_param_idx + 1]),
                 1);
-          } else if constexpr (C_PER_G == 8) {
-            B_zero_point_v = _mm256_set1_epi32(r.B_zero_point[quant_param_idx]);
           } else {
             B_zero_point_v = _mm256_set1_epi32(r.B_zero_point[quant_param_idx]);
           }
@@ -1343,8 +1341,6 @@ void requantizeOutputProcessingGConvAvx2(
                   _mm_set1_ps(r.C_multiplier[quant_param_idx])),
               _mm_set1_ps(r.C_multiplier[quant_param_idx + 1]),
               1);
-        } else if constexpr (C_PER_G == 8) {
-          multiplier_v = _mm256_set1_ps(r.C_multiplier[quant_param_idx]);
         } else {
           multiplier_v = _mm256_set1_ps(r.C_multiplier[quant_param_idx]);
         }