@@ -23,16 +23,32 @@ class SoftmaxWithCrossEntropyOpMaker
   SoftmaxWithCrossEntropyOpMaker(framework::OpProto* proto,
                                  framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
+    // (TODO caoying) replace int with boolean
+    AddAttr<int>("soft_label",
+                 "(int, default 0), A flag to indicate whether to interpret "
+                 "the given labels as soft labels.")
+        .SetDefault(0);
     AddInput("Logits",
-             "The unscaled log probabilities which is a 2-D tensor<float> with"
-             "shape [N x K]. N is the batch_size, and K is the class number.")
+             "(Tensor, default Tensor<float>), The unscaled log probabilities "
+             "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
+             "and K is the class number.")
         .NotInGradient();
-    AddInput("Label", "The ground truth. A 1-D tensor<int> with shape N.");
-    AddOutput("Softmax",
-              "Store the outputs of softmax function, "
-              "which will be used in backward calculation.")
+    AddInput(
+        "Label",
+        "(Tensor, default Tensor<int>), The ground truth, which is "
+        "a 1-D or 2-D tensor. "
+        "If soft_label is set to 0, Label is a Tensor<int> with shape [N x 1]. "
+        "If soft_label is set to 1, Label is a Tensor<float/double> "
+        "with shape [N x K].");
+    AddOutput(
+        "Softmax",
+        "(Tensor, default Tensor<float>), A 2-D tensor with shape [N x K]. "
+        "The output values of the softmax activation for the given input "
+        "batch, which will be used in the backward calculation.")
         .AsIntermediate();
-    AddOutput("Out", "A 1-D tensor<float> with shape N.");
+    AddOutput("Loss",
+              "(Tensor, default Tensor<float>), A 1-D tensor. The cross "
+              "entropy loss with shape [N x 1].");
     AddComment(R"DOC(
 Cross entropy loss with softmax is used as the output layer extensively. This
 operator computes the softmax normalized values for each row of the input
@@ -46,25 +62,18 @@ which will produce incorrect results.
 This operator expects mutually exclusive hard labels: each sample in a batch
 is in exactly one class with probability 1. Each sample in the batch has one
 and only one label.
-)DOC");
-  }
-};
 
-class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+Equation:
 
- protected:
-  void InferShape(const framework::InferShapeContext& ctx) const override {
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
-                            "Input(Out@Grad) should not be null");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Softmax"),
-                            "Input(Softmax) should be not null.");
-    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
-                            "Input(Lable) should be not null.");
+1) hard label (one-hot label)
 
-    ctx.Output<framework::LoDTensor>(framework::GradVarName("Logits"))
-        ->Resize(ctx.Input<Tensor>("Softmax")->dims());
+Loss_j = -\text{Logit}_{Label_j} + \log\left(\sum_{i=1}^{K}\exp(\text{Logit}_i)\right), j = 1, ..., N
+
+2) soft label (a distribution over all classes)
+
+Loss_j = -\sum_{i=1}^{K}\text{Label}_i\left(\text{Logit}_i - \log\left(\sum_{k=1}^{K}\exp(\text{Logit}_k)\right)\right), j = 1, ..., N
+
+)DOC");
   }
 };
 
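The two loss equations in the DOC block above are easy to sanity-check outside the framework. Below is a minimal standalone C++ sketch (illustrative only, not Paddle code; the helper names `HardLabelLoss` and `SoftLabelLoss` are made up for this note) that evaluates both formulas for a single sample, using the numerically stable log-sum-exp form:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Hard-label loss for one sample, per the DOC equation:
//   Loss = -Logit[label] + log(sum_i exp(Logit[i]))
// Subtracting the row max before exp() is the usual log-sum-exp trick;
// it guards against overflow without changing the result.
double HardLabelLoss(const std::vector<double>& logits, std::size_t label) {
  assert(label < logits.size());
  const double max_logit = *std::max_element(logits.begin(), logits.end());
  double sum_exp = 0.0;
  for (double v : logits) sum_exp += std::exp(v - max_logit);
  return -(logits[label] - max_logit) + std::log(sum_exp);
}

// Soft-label loss for one sample, per the DOC equation:
//   Loss = -sum_i Label[i] * (Logit[i] - log(sum_k exp(Logit[k])))
double SoftLabelLoss(const std::vector<double>& logits,
                     const std::vector<double>& labels) {
  assert(logits.size() == labels.size());
  const double max_logit = *std::max_element(logits.begin(), logits.end());
  double sum_exp = 0.0;
  for (double v : logits) sum_exp += std::exp(v - max_logit);
  const double log_z = max_logit + std::log(sum_exp);  // log(sum_k exp(Logit_k))
  double loss = 0.0;
  for (std::size_t i = 0; i < logits.size(); ++i)
    loss += -labels[i] * (logits[i] - log_z);
  return loss;
}
```

With a one-hot `labels` vector, `SoftLabelLoss` reduces to `HardLabelLoss`, which is why a single Softmax intermediate can serve both the soft_label = 0 and soft_label = 1 paths.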
@@ -82,7 +91,25 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
                    "The label should be a 1-d tensor.");
 
     ctx.Output<framework::LoDTensor>("Softmax")->Resize(logits->dims());
-    ctx.Output<framework::LoDTensor>("Out")->Resize({logits->dims()[0], 1});
+    ctx.Output<framework::LoDTensor>("Loss")->Resize({logits->dims()[0], 1});
+  }
+};
+
+class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Loss")),
+                            "Input(Loss@Grad) should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Softmax"),
+                            "Input(Softmax) should not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
+                            "Input(Label) should not be null.");
+
+    ctx.Output<framework::LoDTensor>(framework::GradVarName("Logits"))
+        ->Resize(ctx.Input<Tensor>("Softmax")->dims());
   }
 };
 
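One detail worth noting in the relocated SoftmaxWithCrossEntropyOpGrad: it requires the saved Softmax output rather than the raw Logits. That works because, for this fused op, the gradient of the loss with respect to the logits is simply the softmax output minus the (one-hot or soft) label distribution, so Logits@GRAD has exactly Softmax's dims, which is what InferShape enforces. A hedged standalone C++ sketch of that standard identity (not the actual kernel; the helper names are made up):

```cpp
#include <cstddef>
#include <vector>

// Hard-label case: d(Loss)/d(Logit_i) = Softmax_i - 1{i == label}.
// This identity is why the grad op only needs the saved Softmax
// output (plus the labels), not the original logits.
std::vector<double> HardLabelLogitGrad(const std::vector<double>& softmax,
                                       std::size_t label) {
  std::vector<double> grad = softmax;  // same dims as Softmax
  grad[label] -= 1.0;
  return grad;
}

// Soft-label case: d(Loss)/d(Logit_i) = Softmax_i - Label_i.
std::vector<double> SoftLabelLogitGrad(const std::vector<double>& softmax,
                                       const std::vector<double>& labels) {
  std::vector<double> grad(softmax.size());
  for (std::size_t i = 0; i < softmax.size(); ++i)
    grad[i] = softmax[i] - labels[i];
  return grad;
}
```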