
Commit 7c76e03

JacobSzwejbka authored and facebook-github-bot committed
Switch Optimizer to std::map (#5230)
Summary:
Pull Request resolved: #5230

Switch to the map API, which is directly compatible with TrainingModule. Update the simple end-to-end test to use TrainingModule as well.

Reviewed By: davidlin54

Differential Revision: D62453507

fbshipit-source-id: d40929997d42ea827a97f6fb2a1e38250ac298da
1 parent 6328d41 commit 7c76e03

File tree

7 files changed: +147 −225 lines changed
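
The "map api" the summary refers to is the same map type on both sides of the interface: TrainingModule hands out parameters keyed by their fully qualified names, and SGD now consumes parameters and gradients in exactly that shape. A minimal sketch of the shared type (the alias below is illustrative only and does not appear in the code):

  // Fully qualified parameter name -> tensor, as used by both
  // TrainingModule::named_parameters and SGD::step in this change.
  using NamedTensorMap = std::map<exec_aten::string_view, exec_aten::Tensor>;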

extension/training/module/training_module.cpp

+4
@@ -107,6 +107,10 @@ TrainingModule::named_parameters(const std::string& method_name) {
 
   uint64_t param_start = param_res.get()[0].toInt();
 
+  auto e = executorch::extension::Module::load_method(method_name);
+  if (e != runtime::Error::Ok) {
+    return e;
+  }
   auto& method = methods_.at(method_name).method;
 
   // create dict
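
From the caller's side, the practical effect of the added load_method call is that a failure to load the method now surfaces through the returned Result, rather than requiring the method to have been executed beforehand. A hedged sketch of consuming the API (the method name "forward" is a placeholder):

  auto res = training_module.named_parameters("forward");
  if (!res.ok()) {
    // Propagates load_method failures as well as the not-a-joint-graph error.
    return res.error();
  }
  const std::map<exec_aten::string_view, exec_aten::Tensor>& params = res.get();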

extension/training/module/training_module.h

+1 −2

@@ -68,8 +68,7 @@ class ET_EXPERIMENTAL TrainingModule final : executorch::extension::Module {
    * parameters for.
    *
    * @returns A Result object containing a map of the fully qualified name to
-   * parameter tensor, or an error if the method is not a joint graph or has not
-   * been executed yet.
+   * parameter tensor, or an error if the method is not a joint graph.
    */
   ET_EXPERIMENTAL
   runtime::Result<const std::map<exec_aten::string_view, exec_aten::Tensor>>

extension/training/optimizer/sgd.cpp

+73 −94

@@ -16,7 +16,6 @@ using exec_aten::Tensor;
 using exec_aten::TensorImpl;
 using ::executorch::runtime::Error;
 using ::executorch::runtime::KernelRuntimeContext;
-using ::executorch::runtime::Span;
 
 namespace executorch {
 namespace extension {
@@ -39,25 +38,13 @@ void SGDParamGroup::set_options(std::unique_ptr<SGDOptions> options) {
   options_ = std::move(options);
 }
 
-Span<const char*> SGDParamGroup::param_names() {
-  return param_names_;
-}
-
-const Span<const char*> SGDParamGroup::param_names() const {
-  return param_names_;
-}
-
-Span<Tensor> SGDParamGroup::param_data() {
-  return param_data_;
-}
-
-const Span<Tensor> SGDParamGroup::param_data() const {
-  return param_data_;
+const std::map<exec_aten::string_view, exec_aten::Tensor>&
+SGDParamGroup::named_parameters() const {
+  return named_parameters_;
 }
 
 void SGD::add_param_group(const SGDParamGroup& param_group) {
-  SGDParamGroup param_group_(
-      param_group.param_names(), param_group.param_data());
+  SGDParamGroup param_group_(param_group.named_parameters());
   if (!param_group.has_options()) {
     param_group_.set_options(defaults_->clone());
   } else {
@@ -66,13 +53,8 @@ void SGD::add_param_group(const SGDParamGroup& param_group) {
   param_groups_.emplace_back(std::move(param_group_));
 }
 
-Error SGD::step(Span<const char*> gradient_names, Span<Tensor> gradient_data) {
-  // check that the number of gradient names matches the number of gradients
-  ET_CHECK_OR_RETURN_ERROR(
-      gradient_names.size() == gradient_data.size(),
-      InvalidState,
-      "Gradient names and gradients must have the same length.");
-
+Error SGD::step(const std::map<exec_aten::string_view, exec_aten::Tensor>&
+    named_gradients) {
   KernelRuntimeContext context;
   for (auto& group : param_groups_) {
     auto& options = static_cast<SGDOptions&>(group.options());
@@ -81,85 +63,82 @@ Error SGD::step(Span<const char*> gradient_names, Span<Tensor> gradient_data) {
     auto dampening = options.dampening();
     auto nesterov = options.nesterov();
 
-    for (int i = 0; i < group.param_names().size(); i++) {
-      for (int j = 0; j < gradient_names.size(); j++) {
-        // if param name and gradient name match, run the optimizer step
-        if (strcmp(group.param_names()[i], gradient_names[j]) == 0) {
-          auto d_p = gradient_data[j];
-          auto p = group.param_data()[i];
-          if (weight_decay != 0) {
-            // uses weight_decay specified and adds it to the gradient
-            torch::executor::aten::add_outf(context, d_p, p, weight_decay, d_p);
-            if (context.failure_state() != Error::Ok) {
-              return context.failure_state();
-            }
+    for (auto param_iter = group.named_parameters().begin();
+         param_iter != group.named_parameters().end();
+         ++param_iter) {
+      // if param name and gradient name match, run the optimizer step
+      const auto& named_gradient = named_gradients.find(param_iter->first);
+      if (named_gradient != named_gradients.end()) {
+        auto d_p = named_gradient->second;
+        auto p = param_iter->second;
+        if (weight_decay != 0) {
+          // uses weight_decay specified and adds it to the gradient
+          torch::executor::aten::add_outf(context, d_p, p, weight_decay, d_p);
+          if (context.failure_state() != Error::Ok) {
+            return context.failure_state();
           }
-          if (momentum != 0) {
-            Tensor buf(nullptr);
-            auto param_state = state_.find(p.unsafeGetTensorImpl());
-            // look for the momentum buffer for the given parameter. this is the
-            // momentum as of the previous epoch
-            if (param_state == state_.end()) {
-              // create a new momentum buffer if it doesn't exist. this memory
-              // needs to be freed when the optimizer is destroyed
-              void* buf_ptr = malloc(d_p.nbytes());
+        }
+        if (momentum != 0) {
+          Tensor buf(nullptr);
+          auto param_state = state_.find(p.unsafeGetTensorImpl());
+          // look for the momentum buffer for the given parameter. this is the
+          // momentum as of the previous epoch
+          if (param_state == state_.end()) {
+            // create a new momentum buffer if it doesn't exist. this memory
+            // needs to be freed when the optimizer is destroyed
+            void* buf_ptr = malloc(d_p.nbytes());
 
 #ifdef USE_ATEN_LIB
-              std::vector<int64_t> sizes(
-                  d_p.sizes().begin(), d_p.sizes().end());
-              buf = torch::from_blob(buf_ptr, sizes, d_p.scalar_type());
+            std::vector<int64_t> sizes(d_p.sizes().begin(), d_p.sizes().end());
+            buf = torch::from_blob(buf_ptr, sizes, d_p.scalar_type());
 #else
-              TensorImpl* buf_impl = new TensorImpl(
-                  d_p.scalar_type(),
-                  d_p.sizes().size(),
-                  const_cast<TensorImpl::SizesType*>(d_p.sizes().data()),
-                  buf_ptr,
-                  const_cast<TensorImpl::DimOrderType*>(
-                      d_p.dim_order().data()));
-              buf = Tensor(buf_impl);
+            TensorImpl* buf_impl = new TensorImpl(
+                d_p.scalar_type(),
+                d_p.sizes().size(),
+                const_cast<TensorImpl::SizesType*>(d_p.sizes().data()),
+                buf_ptr,
+                const_cast<TensorImpl::DimOrderType*>(d_p.dim_order().data()));
+            buf = Tensor(buf_impl);
 #endif
-              torch::executor::aten::clone_outf(
-                  context, d_p, exec_aten::MemoryFormat::Contiguous, buf);
-              if (context.failure_state() != Error::Ok) {
-                return context.failure_state();
-              }
-
-              // save the state of the momentum buffer to be reused in later
-              // epochs
-              auto state = std::make_unique<SGDParamState>(buf);
-              state_[p.unsafeGetTensorImpl()] = std::move(state);
-            } else {
-              buf = static_cast<SGDParamState&>(*param_state->second)
-                        .momentum_buffer();
-
-              // update the momentum buffer and apply dampening
-              torch::executor::aten::mul_outf(context, buf, momentum, buf);
-              if (context.failure_state() != Error::Ok) {
-                return context.failure_state();
-              }
-              torch::executor::aten::add_outf(
-                  context, buf, d_p, 1 - dampening, buf);
-              if (context.failure_state() != Error::Ok) {
-                return context.failure_state();
-              }
+            torch::executor::aten::clone_outf(
+                context, d_p, exec_aten::MemoryFormat::Contiguous, buf);
+            if (context.failure_state() != Error::Ok) {
+              return context.failure_state();
             }
-            if (nesterov) {
-              // apply nesterov momentum
-              torch::executor::aten::add_outf(context, d_p, buf, momentum, d_p);
-              if (context.failure_state() != Error::Ok) {
-                return context.failure_state();
-              }
-            } else {
-              d_p = buf;
+
+            // save the state of the momentum buffer to be reused in later
+            // epochs
+            auto state = std::make_unique<SGDParamState>(buf);
+            state_[p.unsafeGetTensorImpl()] = std::move(state);
+          } else {
+            buf = static_cast<SGDParamState&>(*param_state->second)
+                      .momentum_buffer();
+
+            // update the momentum buffer and apply dampening
+            torch::executor::aten::mul_outf(context, buf, momentum, buf);
+            if (context.failure_state() != Error::Ok) {
+              return context.failure_state();
+            }
+            torch::executor::aten::add_outf(
+                context, buf, d_p, 1 - dampening, buf);
+            if (context.failure_state() != Error::Ok) {
+              return context.failure_state();
             }
           }
-          // update the parameter using the gradient and learning rate
-          torch::executor::aten::add_outf(
-              context, p, d_p, -1 * options.lr(), p);
-          if (context.failure_state() != Error::Ok) {
-            return context.failure_state();
+          if (nesterov) {
+            // apply nesterov momentum
+            torch::executor::aten::add_outf(context, d_p, buf, momentum, d_p);
+            if (context.failure_state() != Error::Ok) {
+              return context.failure_state();
+            }
+          } else {
+            d_p = buf;
           }
-          break;
+        }
+        // update the parameter using the gradient and learning rate
+        torch::executor::aten::add_outf(context, p, d_p, -1 * options.lr(), p);
+        if (context.failure_state() != Error::Ok) {
+          return context.failure_state();
         }
       }
     }
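
The core of the change above is replacing the strcmp double loop over two parallel Spans with a keyed lookup: iterate the group's parameter map and probe the gradient map by fully qualified name. A standalone toy version of that pattern, with plain floats standing in for tensors and the ExecuTorch kernels:

  #include <map>
  #include <string_view>

  int main() {
    // Fully qualified name -> "parameter" and "gradient" stand-ins.
    std::map<std::string_view, float> params = {{"linear.weight", 1.0f},
                                                {"linear.bias", 0.5f}};
    std::map<std::string_view, float> grads = {{"linear.weight", 0.1f}};
    const float lr = 0.01f;
    for (auto& [name, p] : params) {
      auto it = grads.find(name);  // O(log n) lookup instead of strcmp over a Span
      if (it == grads.end()) {
        continue;  // no gradient for this parameter in this step
      }
      p -= lr * it->second;  // plain SGD update: p = p - lr * grad
    }
    return 0;
  }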

extension/training/optimizer/sgd.h

+22 −39

@@ -18,7 +18,7 @@
 
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/exec_aten/exec_aten.h>
-#include <executorch/runtime/core/span.h>
+#include <map>
 #include <memory>
 #include <unordered_map>
 #include <vector>
@@ -133,52 +133,42 @@ class SGDParamGroup {
   // NOTE: In order to store `SGDParamGroup` in a `std::vector`, it has
   // to be copy-constructible.
   SGDParamGroup(const SGDParamGroup& param_group)
-      : param_data_(param_group.param_data()),
-        param_names_(param_group.param_names()),
+      : named_parameters_(param_group.named_parameters()),
         options_(
             param_group.has_options() ? param_group.options().clone()
                                       : nullptr) {}
   SGDParamGroup& operator=(const SGDParamGroup& param_group) {
-    this->param_data_ = param_group.param_data();
-    this->param_names_ = param_group.param_names();
+    this->named_parameters_ = param_group.named_parameters_;
    this->options_ =
        param_group.has_options() ? param_group.options().clone() : nullptr;
    return *this;
  }
 
   /**
-   * This constructs a SGD param group. We expect that the two spans are of the
-   * same size, and that for a given param data, its index in param_data is the
-   * same as its param name in param_name.
+   * Constructs a SGD param group.
    *
-   * @param[in] param_names The names of the params for this group.
-   * @param[in] param_data The tensors representing the param data.
+   * @param[in] named_parameters The parameters to be optimized and their fully
+   * qualified names.
    */
   /* implicit */ SGDParamGroup(
-      ::executorch::runtime::Span<const char*> param_names,
-      ::executorch::runtime::Span<exec_aten::Tensor> param_data)
-      : param_data_(std::move(param_data)),
-        param_names_(std::move(param_names)) {}
+      const std::map<exec_aten::string_view, exec_aten::Tensor>&
+          named_parameters)
+      : named_parameters_(named_parameters) {}
   SGDParamGroup(
-      ::executorch::runtime::Span<const char*> param_names,
-      ::executorch::runtime::Span<exec_aten::Tensor> param_data,
+      const std::map<exec_aten::string_view, exec_aten::Tensor>&
+          named_parameters,
       std::unique_ptr<SGDOptions> options)
-      : param_data_(std::move(param_data)),
-        param_names_(std::move(param_names)),
-        options_(std::move(options)) {}
+      : named_parameters_(named_parameters), options_(std::move(options)) {}
 
   bool has_options() const;
   SGDOptions& options();
   const SGDOptions& options() const;
   void set_options(std::unique_ptr<SGDOptions> options);
-  ::executorch::runtime::Span<const char*> param_names();
-  const ::executorch::runtime::Span<const char*> param_names() const;
-  ::executorch::runtime::Span<exec_aten::Tensor> param_data();
-  const ::executorch::runtime::Span<exec_aten::Tensor> param_data() const;
+  const std::map<exec_aten::string_view, exec_aten::Tensor>& named_parameters()
+      const;
 
  private:
-  ::executorch::runtime::Span<exec_aten::Tensor> param_data_;
-  ::executorch::runtime::Span<const char*> param_names_;
+  std::map<exec_aten::string_view, exec_aten::Tensor> named_parameters_;
   std::unique_ptr<SGDOptions> options_;
 };
 
@@ -198,11 +188,10 @@ class SGD {
   }
 
   explicit SGD(
-      ::executorch::runtime::Span<const char*> param_names,
-      ::executorch::runtime::Span<exec_aten::Tensor> param_data,
+      const std::map<exec_aten::string_view, exec_aten::Tensor>&
+          named_parameters,
       SGDOptions defaults)
-      : SGD({SGDParamGroup(std::move(param_names), std::move(param_data))},
-            defaults) {}
+      : SGD({SGDParamGroup(named_parameters)}, defaults) {}
 
   // Adds the given param_group to the optimizer's param_group list.
   void add_param_group(const SGDParamGroup& param_group);
@@ -212,18 +201,12 @@ class SGD {
   /**
    * Performs the optimization step.
    *
-   * The two spans must be of the same size. It is expected that the gradient in
-   * 'gradient_data' at index 'i' represents the gradient calculated in the loss
-   * function for the parameter with the name in 'gradient_names' at index 'i'.
-   *
-   * @param[in] gradient_names The names of the params that matches the gradient
-   * in 'gradient_data' at the same index.
-   * @param[in] gradient_data The gradient tensors to be used for optimization
-   * step.
+   * @param[in] named_gradients The gradients of the tensors specified by the
+   * fully qualified name.
    */
   ::executorch::runtime::Error step(
-      ::executorch::runtime::Span<const char*> gradient_names,
-      ::executorch::runtime::Span<exec_aten::Tensor> gradient_data);
+      const std::map<exec_aten::string_view, exec_aten::Tensor>&
+          named_gradients);
 
  private:
   std::vector<SGDParamGroup> param_groups_;
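
Putting the two headers together, the intended end-to-end flow looks roughly like the sketch below. The method name "forward", the named_gradients accessor, the SGDOptions learning-rate constructor, and the namespace spellings are assumptions for illustration rather than facts established by this diff:

  #include <executorch/extension/training/module/training_module.h>
  #include <executorch/extension/training/optimizer/sgd.h>

  // Namespaces abbreviated; adjust to wherever TrainingModule and SGD live.
  using executorch::extension::training::TrainingModule;
  using executorch::extension::training::optimizer::SGD;
  using executorch::extension::training::optimizer::SGDOptions;

  executorch::runtime::Error train_one_step(TrainingModule& module) {
    // Fully qualified name -> parameter tensor, straight from the module.
    auto param_res = module.named_parameters("forward");
    if (!param_res.ok()) {
      return param_res.error();
    }

    // The optimizer now takes that map directly; SGDOptions(lr) is assumed.
    SGD optimizer(param_res.get(), SGDOptions(0.1));

    // Assumes the joint graph has already been executed so gradients exist,
    // and that the module exposes them keyed by the same fully qualified names.
    auto grad_res = module.named_gradients("forward");
    if (!grad_res.ok()) {
      return grad_res.error();
    }

    // One optimization step over the matching parameter/gradient pairs.
    return optimizer.step(grad_res.get());
  }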

0 commit comments