@@ -6237,7 +6237,7 @@ static LogicalResult getOutputTypeAndPoolingParameters(
     AtenOpT op, ConversionPatternRewriter &rewriter, Value &inputXchw,
     SmallVectorImpl<int64_t> &dilationArray, Type &outputTy,
     DenseI64ArrayAttr &kernel, DenseI64ArrayAttr &stride,
-    DenseI64ArrayAttr &pad) {
+    DenseI64ArrayAttr &pad, SmallVectorImpl<int64_t> &explicitNHWCPad) {
 
   RankedTensorType inputTy = cast<RankedTensorType>(inputXchw.getType());
   if (!inputTy)
@@ -6277,21 +6277,39 @@ static LogicalResult getOutputTypeAndPoolingParameters(
 
   if constexpr (std::is_same<AtenOpT, AtenAvgPool1dOp>() ||
                 std::is_same<AtenOpT, AtenAvgPool2dOp>()) {
-    // Currently, we can not represent `count_include_pad` with the existing
-    // TOSA AvgPool2d specification. Without the below check, we produce silent
-    // wrong answer (SWA) when the `count_include_pad` value is `true.`
-    //
-    // Note: We need to check for `count_include_pad` only when the `padding`
-    // value is non-zero.
+    // When count_include_pad=true with non-zero padding, we materialize an
+    // explicit pad after transposing to NHWC. Track the padding extents and
+    // zero out the TOSA op padding so the divisor matches the full kernel
+    // size.
     bool countIncludePad;
     if ((paddingInts[0] != 0 || paddingInts[1] != 0) &&
         (!matchPattern(op.getCountIncludePad(),
                        m_TorchConstantBool(&countIncludePad)) ||
          countIncludePad)) {
-      return rewriter.notifyMatchFailure(
-          op, "Unsupported `count_include_pad` value, for tosa AvgPool "
-              "`count_include_pad` value should be `False`.");
+      // Remember the spatial padding so we can emit an NHWC tosa.pad right
+      // after the transpose.
+      explicitNHWCPad.assign(
+          {paddingInts[0], paddingInts[0], paddingInts[1], paddingInts[1]});
+
+      auto addPad = [](int64_t dim, int64_t before, int64_t after) -> int64_t {
+        if (ShapedType::isDynamic(dim))
+          return ShapedType::kDynamic;
+        return dim + before + after;
+      };
+
+      // Update the logical input type used for shape computations to include
+      // the extra zeros supplied by the explicit pad.
+      SmallVector<int64_t> paddedShape(inputTy.getShape().begin(),
+                                       inputTy.getShape().end());
+      // Height sits at index inputRank - 2 and width at inputRank - 1 while
+      // the tensor is still in NCHW order; the NHWC transpose happens later.
+      paddedShape[inputRank - 2] =
+          addPad(paddedShape[inputRank - 2], paddingInts[0], paddingInts[0]);
+      paddedShape[inputRank - 1] =
+          addPad(paddedShape[inputRank - 1], paddingInts[1], paddingInts[1]);
+      inputTy = RankedTensorType::get(paddedShape, inputTy.getElementType());
+
+      paddingInts.assign(/*Count=*/2, /*Value=*/0);
     }
   }
 
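Why zeroing the op-level padding fixes the divisor: as the replaced comment notes, TOSA's avg_pool2d cannot express count_include_pad=true, since its divisor counts only non-pad elements. Once the zeros are materialized in the tensor itself, every window is full, so dividing by the kernel size reproduces count_include_pad=true exactly. A standalone toy sketch (plain C++, not torch-mlir code) contrasting the two conventions on the row {2, 4} with kernel 2 and pad 1:

#include <cstdio>
#include <vector>

int main() {
  const std::vector<float> in = {2.0f, 4.0f};
  const int kernel = 2, pad = 1;

  // Materialize the zero padding explicitly, as the lowering now does with
  // an NHWC tosa.pad ahead of the pooling op.
  std::vector<float> padded(in.size() + 2 * pad, 0.0f);
  for (size_t i = 0; i < in.size(); ++i)
    padded[i + pad] = in[i];

  for (size_t start = 0; start + kernel <= padded.size(); ++start) {
    float sum = 0.0f;
    int nonPad = 0;
    for (int k = 0; k < kernel; ++k) {
      sum += padded[start + k];
      if (start + k >= (size_t)pad && start + k < pad + in.size())
        ++nonPad; // element originated from the unpadded input
    }
    // Full-kernel divisor == count_include_pad=true; the non-pad divisor is
    // what a padded TOSA avg_pool computes (count_include_pad=false).
    std::printf("window %zu: include_pad=%.2f exclude_pad=%.2f\n", start,
                sum / kernel, nonPad ? sum / nonPad : 0.0f);
  }
}

The three windows print 1.00/2.00, 3.00/3.00, and 2.00/4.00, matching torch.nn.AvgPool1d with count_include_pad set to true and false respectively.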
@@ -6314,6 +6332,18 @@ static LogicalResult getOutputTypeAndPoolingParameters(
   return success();
 }
 
+template <typename AtenOpT, typename tosaOp>
+static LogicalResult getOutputTypeAndPoolingParameters(
+    AtenOpT op, ConversionPatternRewriter &rewriter, Value &inputXchw,
+    SmallVectorImpl<int64_t> &dilationArray, Type &outputTy,
+    DenseI64ArrayAttr &kernel, DenseI64ArrayAttr &stride,
+    DenseI64ArrayAttr &pad) {
+  SmallVector<int64_t, 4> ignoredExplicitPad;
+  return getOutputTypeAndPoolingParameters<AtenOpT, tosaOp>(
+      op, rewriter, inputXchw, dilationArray, outputTy, kernel, stride, pad,
+      ignoredExplicitPad);
+}
+
 class ConvertAtenMaxPool2dOp
     : public ConvertAtenPoolingBaseOp<AtenMaxPool2dOp, tosa::MaxPool2dOp> {
 public:
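The pad-only overload added above keeps the widened signature source-compatible: callers that predate the explicit-pad support, such as the max-pool patterns, compile unchanged while the extra out-parameter is supplied and discarded internally. A hypothetical call site (mirroring the avg-pool call below; the real max-pool call sites may differ):

  SmallVector<int64_t, 2> dilationArray{1, 1};
  // Old-style call through the thin overload; explicitNHWCPad never surfaces.
  if (failed(getOutputTypeAndPoolingParameters<AtenMaxPool2dOp,
                                               tosa::MaxPool2dOp>(
          op, rewriter, self, dilationArray, outputTy, kernel, stride, pad)))
    return rewriter.notifyMatchFailure(
        op, "invalid pooling parameters or input type");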
@@ -6435,15 +6465,23 @@ class ConvertAtenAvgPool2dOp
     }
 
     SmallVector<int64_t, 2> dilationArray{1, 1};
+    SmallVector<int64_t, 4> explicitNHWCPad;
     if (failed(getOutputTypeAndPoolingParameters<AtenAvgPool2dOp,
                                                  tosa::AvgPool2dOp>(
-            op, rewriter, self, dilationArray, outputTy, kernel, stride, pad)))
+            op, rewriter, self, dilationArray, outputTy, kernel, stride, pad,
+            explicitNHWCPad)))
       return rewriter.notifyMatchFailure(
           op, "invalid pooling parameters or input type");
 
-    // Transpose to xHWC
-    input = ConvertAtenPoolingBaseOp<AtenAvgPool2dOp, tosa::AvgPool2dOp>::
-        transposePoolingInputToHwc(op, rewriter, self);
+    Value transposed =
+        ConvertAtenPoolingBaseOp<AtenAvgPool2dOp, tosa::AvgPool2dOp>::
+            transposePoolingInputToHwc(op, rewriter, self);
+
+    if (!explicitNHWCPad.empty())
+      transposed = tosa::emitExplicitZeroPadNHWC(op->getLoc(), rewriter, op,
+                                                 transposed, explicitNHWCPad);
+
+    input = transposed;
 
     return success();
   }
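tosa::emitExplicitZeroPadNHWC is called here but not defined in this hunk. Judging from the call sites, it takes the NHWC-transposed value plus {top, bottom, left, right} extents and emits a zero-filled tosa.pad. A minimal sketch of such a helper, assuming the newer upstream MLIR API where the pad amounts are passed as a shape operand built with tosa::getTosaConstShape (older toolchains encode the padding, and any pad constant, differently):

static Value emitExplicitZeroPadNHWC(Location loc,
                                     ConversionPatternRewriter &rewriter,
                                     Operation *op, Value inputNHWC,
                                     ArrayRef<int64_t> padTBLR) {
  (void)op; // mirrored from the call site; useful only for diagnostics
  auto inputTy = cast<RankedTensorType>(inputNHWC.getType());
  // N and C stay unpadded; H takes {top, bottom} and W takes {left, right}.
  SmallVector<int64_t, 8> padPairs = {0,          0,          // N
                                      padTBLR[0], padTBLR[1], // H
                                      padTBLR[2], padTBLR[3], // W
                                      0,          0};         // C
  SmallVector<int64_t, 4> resultShape(inputTy.getShape().begin(),
                                      inputTy.getShape().end());
  for (int dim = 0; dim < 4; ++dim)
    if (!ShapedType::isDynamic(resultShape[dim]))
      resultShape[dim] += padPairs[2 * dim] + padPairs[2 * dim + 1];
  Value padding = tosa::getTosaConstShape(rewriter, loc, padPairs);
  // Some TOSA versions also require an explicit zero pad_const operand;
  // elided in this sketch.
  return rewriter.create<tosa::PadOp>(
      loc, RankedTensorType::get(resultShape, inputTy.getElementType()),
      inputNHWC, padding);
}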
@@ -6486,16 +6524,23 @@ class ConvertAtenAvgPool1dOp
             .getResult();
 
     SmallVector<int64_t, 2> dilationArray{1, 1};
+    SmallVector<int64_t, 4> explicitNHWCPad;
     if (failed(getOutputTypeAndPoolingParameters<AtenAvgPool1dOp,
                                                  tosa::AvgPool2dOp>(
             op, rewriter, reshapedSelf, dilationArray, outputTy, kernel, stride,
-            pad)))
+            pad, explicitNHWCPad)))
       return rewriter.notifyMatchFailure(
           op, "invalid pooling parameters or input type");
 
-    // Transpose to xHWC
-    input = ConvertAtenPoolingBaseOp<AtenAvgPool1dOp, tosa::AvgPool2dOp>::
-        transposePoolingInputToHwc(op, rewriter, reshapedSelf);
+    Value transposed =
+        ConvertAtenPoolingBaseOp<AtenAvgPool1dOp, tosa::AvgPool2dOp>::
+            transposePoolingInputToHwc(op, rewriter, reshapedSelf);
+
+    if (!explicitNHWCPad.empty())
+      transposed = tosa::emitExplicitZeroPadNHWC(op->getLoc(), rewriter, op,
+                                                 transposed, explicitNHWCPad);
+
+    input = transposed;
 
     return success();
   }