diff --git a/kernels/aten/functions.yaml b/kernels/aten/functions.yaml
index 463ef0f9d32..5f4b297cdde 100644
--- a/kernels/aten/functions.yaml
+++ b/kernels/aten/functions.yaml
@@ -403,6 +403,8 @@
 - op: unbind_copy.int_out
 
+- op: unfold_copy.out
+
 - op: unsafe_split.Tensor_out
 
 - op: unsqueeze_copy.dim_out
 
diff --git a/kernels/portable/cpu/op_unfold_copy.cpp b/kernels/portable/cpu/op_unfold_copy.cpp
new file mode 100644
index 00000000000..69ddb3368d7
--- /dev/null
+++ b/kernels/portable/cpu/op_unfold_copy.cpp
@@ -0,0 +1,73 @@
+#include <c10/util/irange.h>
+#include <cstdint>
+#include <cstring>
+#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
+#include <executorch/runtime/kernel/kernel_includes.h>
+namespace torch {
+namespace executor {
+namespace native {
+
+using Tensor = executorch::aten::Tensor;
+
+// unfold_copy(Tensor self, int dimension, int size, int step, *, Tensor(a!)
+// out) -> Tensor(a!)
+Tensor unfold_copy_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& self,
+    int64_t dim,
+    int64_t size,
+    int64_t step,
+    Tensor& out) {
+  (void)ctx;
+  // Check if dimension is valid
+  ET_KERNEL_CHECK(
+      ctx, check_unfold_copy_args(self, dim, size, step), InvalidArgument, out);
+  if (dim < 0) {
+    dim += nonzero_dim(self);
+  }
+  // Calculate output size
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  Tensor::SizesType expected_output_size[kTensorDimensionLimit];
+  size_t expected_out_dim = 0;
+
+  get_unfold_copy_out_target_size(
+      self, dim, size, step, expected_output_size, &expected_out_dim);
+
+  ET_KERNEL_CHECK(
+      ctx,
+      resize_tensor(out, {expected_output_size, expected_out_dim}) == Error::Ok,
+      InvalidArgument,
+      out);
+
+  // Copy data
+  const size_t leading_dims = getLeadingDims(self, dim);
+  const size_t trailing_dims = getTrailingDims(self, dim);
+  ScalarType in_type = self.scalar_type();
+  ScalarType out_type = out.scalar_type();
+
+  ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, "unfold_copy.out", CTYPE_IN, [&]() {
+    const CTYPE_IN* input_ptr = self.const_data_ptr<CTYPE_IN>();
+    ET_SWITCH_REALHBBF16_TYPES(
+        out_type, ctx, "unfold_copy.out", CTYPE_OUT, [&] {
+          CTYPE_OUT* out_ptr = out.mutable_data_ptr<CTYPE_OUT>();
+          for (const auto i : c10::irange(leading_dims)) {
+            const CTYPE_IN* src =
+                input_ptr + i * self.size(dim) * trailing_dims;
+            for (const auto j : c10::irange(out.size(dim))) {
+              const CTYPE_IN* dim_src = src + j * step * trailing_dims;
+              for (const auto k : c10::irange(trailing_dims)) {
+                for (const auto l : c10::irange(size)) {
+                  *out_ptr = convert<CTYPE_OUT, CTYPE_IN>(
+                      dim_src[k + l * trailing_dims]);
+                  out_ptr++;
+                }
+              }
+            }
+          }
+        });
+  });
+  return out;
+}
+} // namespace native
+} // namespace executor
+} // namespace torch
diff --git a/kernels/portable/cpu/util/copy_ops_util.cpp b/kernels/portable/cpu/util/copy_ops_util.cpp
index bd01a1be329..229fba2dad0 100644
--- a/kernels/portable/cpu/util/copy_ops_util.cpp
+++ b/kernels/portable/cpu/util/copy_ops_util.cpp
@@ -964,5 +964,46 @@ void get_diagonal_copy_out_target_size(
   out_sizes[in.dim() - 2] = diagonal_size;
 }
 
+bool check_unfold_copy_args(
+    const Tensor& self,
+    int64_t dim,
+    int64_t size,
+    int64_t step) {
+  if (dim < 0) {
+    dim += nonzero_dim(self);
+  }
+  ET_LOG_AND_RETURN_IF_FALSE(tensor_has_dim(self, dim));
+  ET_CHECK_OR_RETURN_FALSE(
+      size >= 0, "size is %" PRId64 " but must be >= 0", size);
+  ET_CHECK_OR_RETURN_FALSE(
+      size <= self.size(dim),
+      "maximum size for tensor at dimension %" PRId64
+      " is %zd but size is %" PRId64,
+      dim,
+      self.size(dim),
+      size);
+  ET_CHECK_OR_RETURN_FALSE(
+      step > 0, "step is %" PRId64 " but must be > 0", step);
+  return true;
+}
+
+void get_unfold_copy_out_target_size(
+    const Tensor& self,
+    int64_t dim,
+    int64_t size,
+    int64_t step,
+    executorch::aten::SizesType* out_sizes,
+    size_t* out_ndim) {
+  for (auto i : c10::irange(self.dim())) {
+    out_sizes[i] = self.size(i);
+  }
+  // At `dim` dimension, we split the tensor into `size` chunks with `step`
+  // stride.
+  out_sizes[dim] = (self.size(dim) - size + step) / step;
+
+  out_sizes[self.dim()] = size;
+  *out_ndim = self.dim() + 1;
+}
+
 } // namespace executor
 } // namespace torch
diff --git a/kernels/portable/cpu/util/copy_ops_util.h b/kernels/portable/cpu/util/copy_ops_util.h
index e7399ae0956..edcc6eb0021 100644
--- a/kernels/portable/cpu/util/copy_ops_util.h
+++ b/kernels/portable/cpu/util/copy_ops_util.h
@@ -233,5 +233,19 @@ void get_diagonal_copy_out_target_size(
     executorch::aten::SizesType* out_sizes,
     size_t* out_ndim);
 
+bool check_unfold_copy_args(
+    const Tensor& self,
+    int64_t dim,
+    int64_t size,
+    int64_t step);
+
+void get_unfold_copy_out_target_size(
+    const Tensor& self,
+    int64_t dim,
+    int64_t size,
+    int64_t step,
+    executorch::aten::SizesType* out_sizes,
+    size_t* out_ndim);
+
 } // namespace executor
 } // namespace torch
diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml
index 3221b8fe349..567d062d573 100644
--- a/kernels/portable/functions.yaml
+++ b/kernels/portable/functions.yaml
@@ -917,6 +917,11 @@
     - arg_meta: null
       kernel_name: torch::executor::unbind_copy_int_out
 
+- op: unfold_copy.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::unfold_copy_out
+
 - op: unsqueeze_copy.out
   kernels:
     - arg_meta: null
diff --git a/kernels/test/op_unfold_copy_test.cpp b/kernels/test/op_unfold_copy_test.cpp
new file mode 100644
index 00000000000..ef3c09c10e3
--- /dev/null
+++ b/kernels/test/op_unfold_copy_test.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/kernels/test/supported_features.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+
+using namespace ::testing;
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using torch::executor::testing::TensorFactory;
+
+class OpUnfoldTest : public OperatorTest {
+ protected:
+  Tensor& op_unfold_copy_out(
+      const Tensor& self,
+      int64_t dim,
+      int64_t size,
+      int64_t step,
+      Tensor& out) {
+    return torch::executor::aten::unfold_copy_outf(
+        context_, self, dim, size, step, out);
+  }
+
+  template <class CTYPE, ScalarType DTYPE>
+  void test_unfold_copy_dtype() {
+    TensorFactory<DTYPE> tf;
+
+    auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    auto expected = tf.make({3, 2, 2}, {1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9});
+    auto actual_out = tf.zeros_like(expected);
+    op_unfold_copy_out(input, /*dim=*/1, /*size=*/2, /*step=*/1, actual_out);
+    EXPECT_TENSOR_CLOSE(actual_out, expected);
+  }
+};
+
+TEST_F(OpUnfoldTest, SmokeTest) {
+  TensorFactory<ScalarType::Float> tf;
+  const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  const auto expected = tf.make({3, 1, 2}, {1, 2, 4, 5, 7, 8});
+  auto output = tf.zeros_like(expected);
+
+  op_unfold_copy_out(input, /*dim=*/1, /*size=*/2, /*step=*/2, output);
+  EXPECT_TENSOR_CLOSE(output, expected);
+}
+
+TEST_F(OpUnfoldTest, DType) {
+#define TEST_ENTRY(ctype, dtype) \
+  test_unfold_copy_dtype<ctype, ScalarType::dtype>();
+  ET_FORALL_REALHBF16_TYPES(TEST_ENTRY);
+#undef TEST_ENTRY
+}
+
+TEST_F(OpUnfoldTest, ZeroDimension) {
+  TensorFactory<ScalarType::Int> tf;
+  const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  const auto expected =
+      tf.make({2, 3, 2}, {1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9});
+  auto output = tf.zeros_like(expected);
+
+  op_unfold_copy_out(input, /*dim=*/0, /*size=*/2, /*step=*/1, output);
+  EXPECT_TENSOR_CLOSE(output, expected);
+}
+
+TEST_F(OpUnfoldTest, NegativeDimension) {
+  TensorFactory<ScalarType::Float> tf;
+  const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  const auto expected = tf.make({3, 1, 2}, {1, 2, 4, 5, 7, 8});
+  auto output = tf.zeros_like(expected);
+
+  op_unfold_copy_out(input, /*dim=*/-1, /*size=*/2, /*step=*/2, output);
+  EXPECT_TENSOR_CLOSE(output, expected);
+}
+
+TEST_F(OpUnfoldTest, LargeStep) {
+  TensorFactory<ScalarType::Float> tf;
+  const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  const auto expected = tf.make({3, 1, 2}, {1, 2, 4, 5, 7, 8});
+  auto output = tf.zeros_like(expected);
+
+  op_unfold_copy_out(input, /*dim=*/-1, /*size=*/2, /*step=*/5, output);
+  EXPECT_TENSOR_CLOSE(output, expected);
+}
+
+TEST_F(OpUnfoldTest, ZeroSize) {
+  TensorFactory<ScalarType::Float> tf;
+  const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  const auto expected = tf.make({3, 4, 0}, {});
+  auto output = tf.zeros_like(expected);
+
+  op_unfold_copy_out(input, /*dim=*/1, /*size=*/0, /*step=*/1, output);
+  EXPECT_TENSOR_CLOSE(output, expected);
+}
+
+TEST_F(OpUnfoldTest, NegativeSizeAndNegativeStepDies) {
+  TensorFactory<ScalarType::Float> tf;
+  const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  auto output = tf.zeros({3, 1, 2});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_,
+      op_unfold_copy_out(input, /*dim=*/1, /*size=*/-1, /*step=*/1, output));
+  ET_EXPECT_KERNEL_FAILURE(
+      context_,
+      op_unfold_copy_out(input, /*dim=*/1, /*size=*/1, /*step=*/-1, output));
+}
+
+TEST_F(OpUnfoldTest, InvalidDimAndSizeTooLargeDies) {
+  TensorFactory<ScalarType::Float> tf;
+  const auto input = tf.make({3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+  auto output = tf.zeros({3, 1, 2});
+  ET_EXPECT_KERNEL_FAILURE(
+      context_,
+      op_unfold_copy_out(input, /*dim=*/3, /*size=*/2, /*step=*/1, output));
+  ET_EXPECT_KERNEL_FAILURE(
+      context_,
+      op_unfold_copy_out(input, /*dim=*/1, /*size=*/10, /*step=*/1, output));
+}
diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl
index 91f2121bebc..0d52a3d2d62 100644
--- a/kernels/test/targets.bzl
+++ b/kernels/test/targets.bzl
@@ -324,6 +324,7 @@ def define_common_targets():
     _common_op_test("op_tril_test", ["aten", "portable"])
     _common_op_test("op_trunc_test", ["aten", "portable"])
     _common_op_test("op_unbind_copy_test", ["aten", "portable"])
+    _common_op_test("op_unfold_copy_test", ["aten", "portable"])
     _common_op_test("op_unsqueeze_copy_test", ["aten", "portable"])
     _common_op_test("op_upsample_bilinear2d_test", ["aten", "portable"])
     _common_op_test("op_upsample_nearest2d_test", ["aten", "portable"])
diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
index f5ddae06b6a..8245f8d345f 100644
--- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -1223,6 +1223,12 @@ ATEN_OPS = (
             "//executorch/kernels/portable/cpu/util:copy_ops_util",
         ],
     ),
+    op_target(
+        name = "op_unfold_copy",
+        deps = [
+            "//executorch/kernels/portable/cpu/util:copy_ops_util",
+        ],
+    ),
     op_target(
         name = "op_unsqueeze_copy",
         deps = [