Commit 0b00ae8

[TOSA] Add cumsum legalization (#4402)
1 parent 791debb commit 0b00ae8

3 files changed: +162 -10 lines changed

lib/Conversion/TorchToTosa/TorchToTosa.cpp

Lines changed: 117 additions & 1 deletion
@@ -48,6 +48,50 @@ namespace mlir::torch {
 
 namespace {
 
+// Runs an in-place inclusive prefix sum along the middle dimension (K) of
+// `running` using a binary lifting scheme. The input must have shape [N, K, C].
+// After the loop, `running` holds the cumsum result with respect to axis=1.
+static Value emitInclusiveScanByPowersOfTwo(Value running,
+                                            ConversionPatternRewriter &rewriter,
+                                            Location loc) {
+  auto nkcTy = cast<RankedTensorType>(running.getType());
+  SmallVector<int64_t> nkcShape(makeShapeTorchCompatible(nkcTy.getShape()));
+  int64_t outer = nkcShape[0];
+  int64_t dimSize = nkcShape[1];
+  int64_t inner = nkcShape[2];
+
+  auto zeroConstOr =
+      tosa::createZeroPointTensor(rewriter, loc, nkcTy.getElementType(), 0);
+  if (!zeroConstOr)
+    return nullptr;
+  Value zeroConst = *zeroConstOr;
+
+  SmallVector<int64_t, 3> sliceStart(3, 0);
+  SmallVector<int64_t, 3> sliceSize = {outer, dimSize, inner};
+
+  for (int64_t offset = 1; offset < dimSize; offset <<= 1) {
+    SmallVector<int64_t, 6> padSpec = {0, 0, offset, 0, 0, 0};
+    auto padShape = tosa::getTosaConstShape(rewriter, loc, padSpec);
+    SmallVector<int64_t> paddedShape = {outer, dimSize + offset, inner};
+    auto paddedTy = RankedTensorType::get(makeShapeLLVMCompatible(paddedShape),
+                                          nkcTy.getElementType());
+    Value padded = tosa::PadOp::create(rewriter, loc, paddedTy, running,
+                                       padShape, zeroConst)
+                       .getResult();
+
+    Value shifted = tosa::SliceOp::create(
+                        rewriter, loc, nkcTy, padded,
+                        tosa::getTosaConstShape(rewriter, loc, sliceStart),
+                        tosa::getTosaConstShape(rewriter, loc, sliceSize))
+                        .getResult();
+
+    running =
+        tosa::AddOp::create(rewriter, loc, nkcTy, running, shifted).getResult();
+  }
+
+  return running;
+}
+
 static SmallVector<int64_t> permuteShape(ArrayRef<int64_t> originalShape,
                                          ArrayRef<int32_t> permutation) {
   SmallVector<int64_t> result;
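
For intuition, the loop above is a classic Hillis-Steele inclusive scan expressed with tosa.pad, tosa.slice, and tosa.add. A minimal NumPy sketch of the same computation (illustrative only, not part of the commit):

import numpy as np

def inclusive_scan_by_powers_of_two(running):
    # `running` has shape [N, K, C]; the scan runs along axis 1, mirroring the
    # pad/slice/add loop emitted by emitInclusiveScanByPowersOfTwo.
    n, k, c = running.shape
    offset = 1
    while offset < k:
        # tosa.pad: prepend `offset` zeros along the scanned axis.
        padded = np.pad(running, ((0, 0), (offset, 0), (0, 0)))
        # tosa.slice: keep the first K rows, i.e. the data shifted down by `offset`.
        shifted = padded[:, :k, :]
        # tosa.add: fold the shifted copy into the running sum.
        running = running + shifted
        offset <<= 1
    return running

x = np.arange(6, dtype=np.float32).reshape(1, 6, 1)
assert np.array_equal(inclusive_scan_by_powers_of_two(x), np.cumsum(x, axis=1))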
@@ -4574,7 +4618,7 @@ LogicalResult ConvertAtenOp<AtenSliceTensorOp>::matchAndRewrite(
     return rewriter.notifyMatchFailure(op, "dim out of range");
 
   SmallVector<int64_t> inputShape =
-      llvm::to_vector(makeShapeTorchCompatible(selfType.getShape()));
+      makeShapeTorchCompatible(selfType.getShape());
   const int64_t K = inputShape[dim];
 
   int64_t start;
@@ -9617,6 +9661,77 @@ LogicalResult ConvertAtenOp<AtenUnfoldOp>::matchAndRewrite(
   return success();
 }
 
+// Legalization for aten.cumsum
+template <>
+LogicalResult ConvertAtenOp<AtenCumsumOp>::matchAndRewrite(
+    AtenCumsumOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  auto self = adaptor.getSelf();
+  auto selfType = dyn_cast<RankedTensorType>(self.getType());
+  if (!selfType || !selfType.hasStaticShape())
+    return rewriter.notifyMatchFailure(op,
+                                       "Only static tensor shapes supported");
+
+  auto loc = op->getLoc();
+
+  int64_t dim;
+  if (!matchPattern(op.getDim(), m_TorchConstantInt(&dim)))
+    return rewriter.notifyMatchFailure(op, "dim must be constant");
+  dim = toPositiveDim(dim, selfType.getRank());
+  if (!isValidDim(dim, selfType.getRank()))
+    return rewriter.notifyMatchFailure(op, "dim out of range");
+
+  auto outTypeAny = getTypeConverter()->convertType(op.getType());
+  auto outType = dyn_cast<RankedTensorType>(outTypeAny);
+  if (!outType)
+    return rewriter.notifyMatchFailure(op, "expected ranked result type");
+
+  auto outElemTy = outType.getElementType();
+  auto castTy = RankedTensorType::get(selfType.getShape(), outElemTy);
+  Value selfCast = self;
+  if (selfType.getElementType() != outElemTy) {
+    auto maybeCast = tosa::tosaCastTensorToType(rewriter, self, castTy);
+    if (!maybeCast)
+      return rewriter.notifyMatchFailure(op, "failed to cast tensor to dtype");
+    selfCast = *maybeCast;
+  }
+
+  SmallVector<int64_t> inputShape =
+      makeShapeTorchCompatible(selfType.getShape());
+  int64_t dimSize = inputShape[dim];
+
+  int64_t outer = 1;
+  for (int64_t i = 0; i < dim; ++i)
+    outer *= inputShape[i];
+  int64_t inner = 1;
+  for (int64_t i = dim + 1, e = inputShape.size(); i < e; ++i)
+    inner *= inputShape[i];
+
+  // Collapse the tensor to [outer, dimSize, inner] so the scanned dimension
+  // is isolated. `outer` is the product of all dims before `dim`, and `inner`
+  // is the product of all dims after `dim`. This lets us run a simple binary
+  // lifting prefix-sum in 3D regardless of the original rank.
+  SmallVector<int64_t> nkcShape = {outer, dimSize, inner};
+  auto nkcTy =
+      RankedTensorType::get(makeShapeLLVMCompatible(nkcShape), outElemTy);
+
+  Value running =
+      tosa::ReshapeOp::create(rewriter, loc, nkcTy, selfCast,
+                              tosa::getTosaConstShape(rewriter, loc, nkcShape))
+          .getResult();
+
+  // Accumulate in-place: `running` always has shape [outer, dimSize, inner].
+  running = emitInclusiveScanByPowersOfTwo(running, rewriter, loc);
+
+  auto finalShape = outType.getShape();
+  auto result = tosa::ReshapeOp::create(
+      rewriter, loc, outType, running,
+      tosa::getTosaConstShape(rewriter, loc, finalShape));
+
+  rewriter.replaceOp(op, result.getResult());
+  return success();
+}
+
 template <typename OpTy>
 class ConvertCastEquivalentOp : public OpConversionPattern<OpTy> {
   using OpConversionPattern<OpTy>::OpConversionPattern;
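
The reshape bookkeeping around the scan (collapse to [outer, dimSize, inner], scan the middle axis, reshape back) can be sanity-checked on its own. A minimal NumPy sketch, illustrative only, with np.cumsum standing in for the emitted TOSA scan loop:

import numpy as np

def cumsum_via_3d_collapse(x, dim):
    # Collapse every dimension before `dim` into `outer` and every dimension
    # after it into `inner`, exactly as the pattern does before emitting the scan.
    shape = x.shape
    outer = int(np.prod(shape[:dim], dtype=np.int64))
    inner = int(np.prod(shape[dim + 1:], dtype=np.int64))
    running = x.reshape(outer, shape[dim], inner)
    running = np.cumsum(running, axis=1)  # stand-in for the pad/slice/add rounds
    return running.reshape(shape)

x = np.random.rand(2, 3, 4).astype(np.float32)
for dim in range(x.ndim):
    assert np.allclose(cumsum_via_3d_collapse(x, dim), np.cumsum(x, axis=dim))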
@@ -10242,6 +10357,7 @@ std::set<StringRef> populateTorchToTosaConversionPatternsAndIllegalOps(
   INSERT_ATENOP_PATTERN(AtenExpm1Op);
   INSERT_ATENOP_PATTERN(AtenTanOp);
   INSERT_ATENOP_PATTERN(AtenUnfoldOp);
+  INSERT_ATENOP_PATTERN(AtenCumsumOp);
   INSERT_ATENOP_PATTERN(AtenQuantizePerTensorOp);
 #undef INSERT_ATENOP_PATTERN

projects/pt1/e2e_testing/xfail_sets.py

Lines changed: 0 additions & 9 deletions
@@ -3618,7 +3618,6 @@
     "Conv_Transpose1dModule_basic",
     "Conv_Transpose1dStaticModule_basic",
     "IndexPutWithNoneAndBroadcastModule_basic",
-    "MaskedScatterStaticBasic_basic",
     "MaxUnpool3dModulePad0_basic",
     "MaxUnpool3dModule_basic",
     "MaxUnpool2dModule_basic",

@@ -3729,11 +3728,7 @@
     "ConvolutionModule3DGroups_basic",
     "ConvolutionModule3DGroupsStrided_basic",
     "ConvolutionModule3DGroupsDilated_basic",
-    "CumsumInputDtypeInt32Module_basic",
-    "CumsumWithDtypeModule_basic",
     "CumsumModule_basic",
-    "CumsumStaticModule_basic",
-    "CumsumStaticNegativeDimModule_basic",
     "CumprodModule_basic",
     "CumprodInputDtypeInt32Module_basic",
     "CumprodStaticModule_basic",

@@ -3812,10 +3807,6 @@
     "LinalgNormKeepDimComplexModule_basic",
     "LinalgVectorNormComplexModule_basic",
     "LinspaceEmptyModule_basic",
-    "LogCumsumExpModule_basic",
-    "LogCumsumExpStaticNegativeDimModule_basic",
-    "LogCumsumExpStaticFloat64DtypeModule_basic",
-    "MaskedScatterStaticBasic_basic",
     "MaxPool1dWithIndicesModule_basic",
    "MaxPool1dWithIndicesCeilModeModule_basic",
    "MaxPool1dCeilModeTrueModule_basic",

test/Conversion/TorchToTosa/basic.mlir

Lines changed: 45 additions & 0 deletions
@@ -4439,6 +4439,51 @@ func.func @torch.aten.linear$f16(%arg0: !torch.vtensor<[2,4],f16>, %arg1: !torch
   return %0 : !torch.vtensor<[2,3],f16>
 }
 
+// CHECK-LABEL: func.func @torch.aten.cumsum.basic(
+// CHECK-SAME:    %[[ARG:.*]]: !torch.vtensor<[2,3],f32>) -> !torch.vtensor<[2,3],f32> {
+// CHECK:         %[[IN:.*]] = torch_c.to_builtin_tensor %[[ARG]] : !torch.vtensor<[2,3],f32> -> tensor<2x3xf32>
+// CHECK:         %[[RESHAPE_SHAPE:.*]] = tosa.const_shape {values = dense<[2, 3, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
+// CHECK:         %[[RESHAPED:.*]] = tosa.reshape %[[IN]], %[[RESHAPE_SHAPE]] : (tensor<2x3xf32>, !tosa.shape<3>) -> tensor<2x3x1xf32>
+// CHECK:         %[[ZERO:.*]] = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+// CHECK:         %[[PAD_SPEC:.*]] = tosa.const_shape {values = dense<[0, 0, 1, 0, 0, 0]> : tensor<6xindex>} : () -> !tosa.shape<6>
+// CHECK:         %[[PADDED:.*]] = tosa.pad %[[RESHAPED]], %[[PAD_SPEC]], %[[ZERO]] : (tensor<2x3x1xf32>, !tosa.shape<6>, tensor<1xf32>) -> tensor<2x4x1xf32>
+// CHECK:         %[[SLICE_START:.*]] = tosa.const_shape {values = dense<0> : tensor<3xindex>} : () -> !tosa.shape<3>
+// CHECK:         %[[SLICE_SIZE:.*]] = tosa.const_shape {values = dense<[2, 3, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
+// CHECK:         %[[SLICE:.*]] = tosa.slice %[[PADDED]], %[[SLICE_START]], %[[SLICE_SIZE]] : (tensor<2x4x1xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<2x3x1xf32>
+// CHECK:         %[[ACC1:.*]] = tosa.add %[[RESHAPED]], %[[SLICE]] : (tensor<2x3x1xf32>, tensor<2x3x1xf32>) -> tensor<2x3x1xf32>
+// CHECK:         %[[ACC2:.*]] = tosa.add %[[ACC1]], %{{.*}} : (tensor<2x3x1xf32>, tensor<2x3x1xf32>) -> tensor<2x3x1xf32>
+// CHECK:         %[[FINAL:.*]] = tosa.const_shape {values = dense<[2, 3]> : tensor<2xindex>} : () -> !tosa.shape<2>
+// CHECK:         %[[OUT:.*]] = tosa.reshape %[[ACC2]], %[[FINAL]] : (tensor<2x3x1xf32>, !tosa.shape<2>) -> tensor<2x3xf32>
+// CHECK:         %[[TORCH:.*]] = torch_c.from_builtin_tensor %[[OUT]] : tensor<2x3xf32> -> !torch.vtensor<[2,3],f32>
+// CHECK:         return %[[TORCH]] : !torch.vtensor<[2,3],f32>
+// CHECK:       }
+func.func @torch.aten.cumsum.basic(%arg0: !torch.vtensor<[2,3],f32>) -> !torch.vtensor<[2,3],f32> {
+  %dim = torch.constant.int 1
+  %none = torch.constant.none
+  %0 = torch.aten.cumsum %arg0, %dim, %none : !torch.vtensor<[2,3],f32>, !torch.int, !torch.none -> !torch.vtensor<[2,3],f32>
+  return %0 : !torch.vtensor<[2,3],f32>
+}
+
+// CHECK-LABEL: func.func @torch.aten.cumsum.si32(
+// CHECK-SAME:    %[[ARG:.*]]: !torch.vtensor<[3,2],si32>) -> !torch.vtensor<[3,2],si32> {
+// CHECK:         %[[IN:.*]] = torch_c.to_builtin_tensor %[[ARG]] : !torch.vtensor<[3,2],si32> -> tensor<3x2xi32>
+// CHECK:         %[[RESHAPE:.*]] = tosa.reshape %[[IN]], %{{.*}} : (tensor<3x2xi32>, !tosa.shape<3>) -> tensor<{{.*}}xi32>
+// CHECK:         %[[ZERO:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi32>}> : () -> tensor<1xi32>
+// CHECK:         %[[PAD:.*]] = tosa.pad %[[RESHAPE]], %{{.*}}, %[[ZERO]] : (tensor<{{.*}}xi32>, !tosa.shape<6>, tensor<1xi32>) -> tensor<{{.*}}xi32>
+// CHECK:         %[[SLICE:.*]] = tosa.slice %[[PAD]], %{{.*}}, %{{.*}} : (tensor<{{.*}}xi32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<{{.*}}xi32>
+// CHECK:         %[[ACC1:.*]] = tosa.add %[[RESHAPE]], %[[SLICE]] : (tensor<{{.*}}xi32>, tensor<{{.*}}xi32>) -> tensor<{{.*}}xi32>
+// CHECK:         %[[ACC2:.*]] = tosa.add %[[ACC1]], %{{.*}} : (tensor<{{.*}}xi32>, tensor<{{.*}}xi32>) -> tensor<{{.*}}xi32>
+// CHECK:         %[[FINAL:.*]] = tosa.reshape %[[ACC2]], %{{.*}} : (tensor<{{.*}}xi32>, !tosa.shape<2>) -> tensor<3x2xi32>
+// CHECK:         %[[TORCH:.*]] = torch_c.from_builtin_tensor %[[FINAL]] : tensor<3x2xi32> -> !torch.vtensor<[3,2],si32>
+// CHECK:         return %[[TORCH]] : !torch.vtensor<[3,2],si32>
+// CHECK:       }
+func.func @torch.aten.cumsum.si32(%arg0: !torch.vtensor<[3,2],si32>) -> !torch.vtensor<[3,2],si32> {
+  %dim = torch.constant.int 0
+  %none = torch.constant.none
+  %0 = torch.aten.cumsum %arg0, %dim, %none : !torch.vtensor<[3,2],si32>, !torch.int, !torch.none -> !torch.vtensor<[3,2],si32>
+  return %0 : !torch.vtensor<[3,2],si32>
+}
+
 // -----
 func.func @torch.aten.empty.memory_format() -> !torch.vtensor<[1,0,256],f32>{
   %c1 = torch.constant.int 1
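
Both new tests scan a dimension of size 3 (dim 1 of [2,3] and dim 0 of [3,2]), so the lowering unrolls exactly two pad/slice/add rounds, offsets 1 and 2, which is why each CHECK block expects two tosa.add results (%[[ACC1]] and %[[ACC2]]). A small Python sanity check, not part of the test file:

def scan_rounds(k):
    # Number of pad/slice/add rounds the loop emits for a scan of length k:
    # the offsets 1, 2, 4, ... strictly below k.
    rounds, offset = 0, 1
    while offset < k:
        rounds += 1
        offset <<= 1
    return rounds

assert scan_rounds(3) == 2  # matches the two tosa.add ops checked in both tests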
