diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index bbc9eec3a0e..a251891e622 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -89,6 +89,7 @@ deps = [ "executorch", "executorch_core", "extension_threadpool", + "optimized_cpublas", "portable_kernels", ] @@ -146,6 +147,7 @@ deps = [ "executorch_core", "executorch", "extension_threadpool", + "optimized_cpublas", "portable_kernels", ] # ---------------------------------- core end ---------------------------------- @@ -413,6 +415,7 @@ excludes = [ deps = [ "executorch", "executorch_core", + "optimized_cpublas", "optimized_kernels", "extension_threadpool", "reduce_util", @@ -452,6 +455,7 @@ deps = [ "extension_data_loader", "extension_module", "extension_threadpool", + "optimized_cpublas", "portable_kernels", "quantized_kernels", "xnnpack_backend", diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 931d31de8ef..13ec0b876ad 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -89,6 +89,7 @@ set(lib_list pthreadpool vulkan_backend optimized_kernels + optimized_portable_kernels cpublas eigen_blas optimized_ops_lib @@ -132,7 +133,26 @@ endforeach() # target_compile_options/target_compile_definitions for everything. 
if(TARGET cpublas) set_target_properties( - cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool + cpublas PROPERTIES INTERFACE_LINK_LIBRARIES + "extension_threadpool;eigen_blas" + ) +endif() +if(TARGET optimized_kernels) + set_target_properties( + optimized_kernels PROPERTIES INTERFACE_LINK_LIBRARIES + "executorch_core;cpublas;extension_threadpool" + ) +endif() +if(TARGET optimized_native_cpu_ops_lib) + if(TARGET optimized_portable_kernels) + set(_maybe_optimized_portable_kernels_lib optimized_portable_kernels) + else() + set(_maybe_optimized_portable_kernels_lib portable_kernels) + endif() + set_target_properties( + optimized_native_cpu_ops_lib + PROPERTIES INTERFACE_LINK_LIBRARIES + "optimized_kernels;${_maybe_optimized_portable_kernels_lib}" ) endif() if(TARGET extension_threadpool) diff --git a/configurations/CMakeLists.txt b/configurations/CMakeLists.txt index 462124a6ea6..cf304d92523 100644 --- a/configurations/CMakeLists.txt +++ b/configurations/CMakeLists.txt @@ -47,12 +47,17 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) message("Generated files ${gen_command_sources}") # optimized_native_cpu_ops_lib: Register optimized op kernels into the runtime + if(TARGET optimized_portable_kernels) + set(_optimized_native_cpu_ops_lib_portable_kernels_lib optimized_portable_kernels) + else() + set(_optimized_native_cpu_ops_lib_portable_kernels_lib portable_kernels) + endif() gen_operators_lib( LIB_NAME "optimized_native_cpu_ops_lib" KERNEL_LIBS - portable_kernels optimized_kernels + ${_optimized_native_cpu_ops_lib_portable_kernels_lib} DEPS executorch ) diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index ba722d9c791..03595efbfea 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -84,10 +84,6 @@ if(TARGET optimized_native_cpu_ops_lib) APPEND link_libraries optimized_native_cpu_ops_lib - optimized_kernels - portable_kernels - cpublas - eigen_blas ) 
target_link_options_shared_lib(optimized_native_cpu_ops_lib) else() diff --git a/kernels/portable/CMakeLists.txt index 885c509246b..e15970329c1 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -63,6 +63,22 @@ gen_operators_lib( LIB_NAME "portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch ) +# Portable kernels support optional parallelization (and, in the +# future, perhaps other performance features). If support is present, +# produce an optimized version. +set(BUILD_OPTIMIZED_PORTABLE_KERNELS ${EXECUTORCH_BUILD_PTHREADPOOL}) + +if(BUILD_OPTIMIZED_PORTABLE_KERNELS) + add_library(optimized_portable_kernels ${_portable_kernels__srcs}) + target_link_libraries(optimized_portable_kernels PRIVATE executorch) + target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) + target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options}) + install( + TARGETS optimized_portable_kernels + DESTINATION lib + ) +endif() + install( TARGETS portable_kernels portable_ops_lib DESTINATION lib diff --git a/kernels/portable/cpu/op_argmin.cpp index a0ee82d2612..87e90de4c04 100644 --- a/kernels/portable/cpu/op_argmin.cpp +++ b/kernels/portable/cpu/op_argmin.cpp @@ -12,6 +12,7 @@ #include #include +#include #include namespace torch { @@ -47,30 +48,43 @@ Tensor& argmin_out( ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] { long* out_data = out.mutable_data_ptr(); - for (const auto out_ix : c10::irange(out.numel())) { - std::tuple acc = reduce_over_dim( - [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { - // the below condition as written is equivalent to !isnan(accval) && - // (isnan(v) || v < acc_val). cases: - // - if neither acc_val nor v is NaN, !(v >= acc_val) is - // trivially equivalent to v < acc_val. - // - if acc_val is NaN, the whole thing is trivially false. 
- // - if acc_val is not NaN and v is NaN, then v >= acc_val - // - is false because all comparisons involving NaN are - // - false, so the result is true. The result is trivially - // - true for the above condition that uses isnan(v) as - // - well. - if (!std::isnan(acc_val) && !(v >= acc_val)) { - acc_val = v; - acc_ix = ix; - } - return std::tuple{acc_val, acc_ix}; - }, - in, - dim, - out_ix); - out_data[out_ix] = std::get<1>(acc); - } + // REVIEW: this is the parallelization strategy ATen uses + // specifically when the reduction is along the last dimension and + // that dimension is contiguous. Is there any particular reason we + // shouldn't just always use this strategy since we aren't + // otherwise capable of parallelizing reductions? + const int64_t reduction_size = get_reduced_dim_product(in, dim); + const auto grain_size = std::max( + static_cast(1), + executorch::extension::internal::GRAIN_SIZE / reduction_size); + const bool success = executorch::extension::parallel_for( + 0, out.numel(), grain_size, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + std::tuple acc = reduce_over_dim( + [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) { + // the below condition as written is equivalent to + // !isnan(acc_val) && (isnan(v) || v < acc_val). cases: + // - if neither acc_val nor v is NaN, !(v >= acc_val) is + // trivially equivalent to v < acc_val. + // - if acc_val is NaN, the whole thing is trivially false. + // - if acc_val is not NaN and v is NaN, then v >= acc_val + // - is false because all comparisons involving NaN are + // - false, so the result is true. The result is trivially + // - true for the above condition that uses isnan(v) as + // - well. 
+ if (!std::isnan(acc_val) && !(v >= acc_val)) { + acc_val = v; + acc_ix = ix; + } + return std::tuple{acc_val, acc_ix}; + }, + in, + dim, + out_ix); + out_data[out_ix] = std::get<1>(acc); + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); }); return out; diff --git a/kernels/portable/cpu/util/reduce_util.cpp b/kernels/portable/cpu/util/reduce_util.cpp index 09ba508a31d..31296d67ee7 100644 --- a/kernels/portable/cpu/util/reduce_util.cpp +++ b/kernels/portable/cpu/util/reduce_util.cpp @@ -83,12 +83,8 @@ size_t get_reduced_dim_product( if (in.dim() == 0) { return 1; } - size_t dim_product = 1; if (!dim.has_value()) { - for (size_t i = 0; i < static_cast(in.dim()); ++i) { - dim_product *= in.size(i); - } - return dim_product; + return in.numel(); } const size_t d = _normalize_non_neg_d(dim.value(), in.dim()); return in.size(d); @@ -104,16 +100,12 @@ size_t get_reduced_dim_product( if (in.dim() == 0) { return 1; } - size_t dim_product = 1; - const size_t in_dim = in.dim(); if (!dim_list.has_value() || dim_list.value().size() == 0) { - for (size_t i = 0; i < static_cast(in.dim()); ++i) { - dim_product *= in.size(i); - } - return dim_product; + return in.numel(); } + size_t dim_product = 1; for (const auto& d : dim_list.value()) { - const size_t non_neg_d = _normalize_non_neg_d(d, in_dim); + const size_t non_neg_d = _normalize_non_neg_d(d, in.dim()); dim_product *= in.size(non_neg_d); } return dim_product; diff --git a/kernels/test/CMakeLists.txt b/kernels/test/CMakeLists.txt index 394ec241698..af0716b06ed 100644 --- a/kernels/test/CMakeLists.txt +++ b/kernels/test/CMakeLists.txt @@ -23,11 +23,11 @@ foreach(kernel ${_kernels}) "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/test" ) set(_wrapper_path "${_wrapper_dir}/FunctionHeaderWrapper.h") + set(_functions_include "#include ") add_custom_command( OUTPUT "${_wrapper_path}" COMMAND mkdir -p ${_wrapper_dir} - COMMAND echo "#include " > - "${_wrapper_path}" + COMMAND echo 
${_functions_include} > "${_wrapper_path}" DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/NativeFunctions.h" @@ -53,7 +53,17 @@ foreach(kernel ${_kernels}) COMMENT "Generating ${_wrapper_dir}/supported_features.cpp and header" VERBATIM ) - + if(${kernel} STREQUAL "optimized") + set(_kernel_ops_lib "optimized_native_cpu_ops_lib") + set(_kernel_ops_lib_path + "${CMAKE_CURRENT_BINARY_DIR}/../../configurations/optimized_native_cpu_ops_lib" + ) + else() + set(_kernel_ops_lib "${kernel}_ops_lib") + set(_kernel_ops_lib_path + "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib" + ) + endif() add_custom_command( OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h" @@ -63,10 +73,9 @@ foreach(kernel ${_kernels}) mkdir -p "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" COMMAND - cp - "${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib/*.h" + cp "${_kernel_ops_lib_path}/*.h" "${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/" - DEPENDS "${kernel}_ops_lib" + DEPENDS ${_kernel_ops_lib} ) endforeach() @@ -280,6 +289,11 @@ set(_optimized_kernels_test_sources ${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.cpp ) +if(TARGET optimized_portable_kernels) + list(APPEND _optimized_kernels_test_sources ${all_test_sources}) + list(REMOVE_DUPLICATES _optimized_kernels_test_sources) +endif() + et_cxx_test( optimized_kernels_test SOURCES @@ -287,16 +301,16 @@ et_cxx_test( EXTRA_LIBS cpuinfo extension_threadpool - optimized_kernels - optimized_ops_lib - portable_kernels + optimized_native_cpu_ops_lib pthreadpool eigen_blas ) add_dependencies(optimized_kernels_test generate_wrapper) target_include_directories( - optimized_kernels_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized" - 
"${CMAKE_INSTALL_PREFIX}/include" + optimized_kernels_test + PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized" + "${CMAKE_CURRENT_BINARY_DIR}/include/portable" + "${CMAKE_INSTALL_PREFIX}/include" ) if(TARGET quantized_kernels) diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index b56413b92f4..dd48da64c30 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -284,6 +284,7 @@ ATEN_OPS = ( name = "op_argmin", deps = [ "//executorch/kernels/portable/cpu/util:reduce_util", + "//executorch/runtime/kernel:thread_parallel_interface", ], ), op_target(