Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 184 additions & 0 deletions paddle/fluid/operators/average_accumulates_op.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/average_accumulates_op.h"

namespace paddle {
namespace operators {

// CPU specialization of getAccumulators.
// Copies the first int64_t element of the "in_num_updates",
// "in_num_accumulates" and "in_old_num_accumulates" input tensors into the
// corresponding reference parameters.
template <>
void getAccumulators<paddle::platform::CPUDeviceContext>(
    const framework::ExecutionContext& ctx, int64_t& num_updates_,
    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
  // Resolve all three counter tensors first, then read them.
  const Tensor* old_accumulates_src =
      ctx.Input<Tensor>("in_old_num_accumulates");
  const Tensor* accumulates_src = ctx.Input<Tensor>("in_num_accumulates");
  const Tensor* updates_src = ctx.Input<Tensor>("in_num_updates");

  // Each counter lives in element 0 of its tensor's int64_t buffer.
  old_num_accumulates_ = *old_accumulates_src->data<int64_t>();
  num_accumulates_ = *accumulates_src->data<int64_t>();
  num_updates_ = *updates_src->data<int64_t>();
}

// CPU specialization of setAccumulators.
// Stores the three counter values into element 0 of the
// "out_old_num_accumulates", "out_num_accumulates" and "out_num_updates"
// output tensors.
template <>
void setAccumulators<paddle::platform::CPUDeviceContext>(
    const framework::ExecutionContext& ctx, int64_t num_updates_,
    int64_t num_accumulates_, int64_t old_num_accumulates_) {
  // Resolve all three destination tensors first, then write them.
  Tensor* old_accumulates_dst = ctx.Output<Tensor>("out_old_num_accumulates");
  Tensor* accumulates_dst = ctx.Output<Tensor>("out_num_accumulates");
  Tensor* updates_dst = ctx.Output<Tensor>("out_num_updates");

  // data<int64_t>() is used as-is; this function does not allocate the
  // output buffers itself.
  *old_accumulates_dst->data<int64_t>() = old_num_accumulates_;
  *accumulates_dst->data<int64_t>() = num_accumulates_;
  *updates_dst->data<int64_t>() = num_updates_;
}

// Operator definition for "average_accumulates".
// InferShape verifies that every input and output slot is present and derives
// the output shapes; GetExpectedKernelType selects the kernel from the data
// type of "param" and the execution place.
class AverageAccumulatesOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks slot presence, then gives out_sum_{1,2,3} the shape of "param" and
  // the three counter outputs the shape {1}.
  // Each error message names the exact slot that was checked so that a
  // failure can be matched directly against the op's declared slot names.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(
        ctx->HasInput("param"),
        "Input (param) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("in_sum_1"),
        "Input (in_sum_1) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("in_sum_2"),
        "Input (in_sum_2) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("in_sum_3"),
        "Input (in_sum_3) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("in_num_accumulates"),
        "Input (in_num_accumulates) of average_accumulates op should "
        "not be null.");
    PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
                   "Input (in_old_num_accumulates) of average_accumulates op "
                   "should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("in_num_updates"),
        "Input (in_num_updates) of average_accumulates op should not be "
        "null.");

    PADDLE_ENFORCE(
        ctx->HasOutput("out_sum_1"),
        "Output (out_sum_1) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("out_sum_2"),
        "Output (out_sum_2) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("out_sum_3"),
        "Output (out_sum_3) of average_accumulates op should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
                   "Output (out_num_accumulates) of average_accumulates op "
                   "should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
                   "Output (out_old_num_accumulates) of average_accumulates "
                   "op should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("out_num_updates"),
        "Output (out_num_updates) of average_accumulates op should not be "
        "null.");

    // The sum buffers mirror the parameter's shape.
    auto in_dim = ctx->GetInputDim("param");

    ctx->SetOutputDim("out_sum_1", in_dim);
    ctx->SetOutputDim("out_sum_2", in_dim);
    ctx->SetOutputDim("out_sum_3", in_dim);
    // The counters are single-element tensors.
    ctx->SetOutputDim("out_num_accumulates", {1});
    ctx->SetOutputDim("out_old_num_accumulates", {1});
    ctx->SetOutputDim("out_num_updates", {1});
  }

 protected:
  // The kernel is keyed on the element type of "param" and on the place the
  // op executes on.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("param")->type()),
        ctx.GetPlace());
  }
};

class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
public:
AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("param",
"Input(Tensor or LoDTensor): The parameter to be accumulated.");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Input(Tensor or LoDTensor) -> (Tensor or LoDTensor)

There is no Input before (

https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L79

The same as below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

AddInput("in_sum_1",
"Input(Tensor or LoDTensor): A tensor used to store the parameter "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now, maybe all the inputs and outputs are Tensor.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

"sums with the same shape as input(param).");
AddInput("in_sum_2",
"Input(Tensor or LoDTensor): A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param). It is used to avoid loss of precision due to too "
"many sums.");
AddInput("in_sum_3",
"Input(Tensor or LoDTensor): A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param).");
AddInput("in_num_accumulates",
"Input(Tensor): The accumulating times of current window with "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tensor<int64_t>

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

"shape [1].");
AddInput("in_old_num_accumulates",
"Input(Tensor): The accumulating times of previous window with "
"shape [1].");
AddInput("in_num_updates",
"Input(Tensor): The total number of batches used by trainning "
"before this batch with shape [1].");
Copy link
Contributor

@qingqing01 qingqing01 Mar 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in_num_accumulates
in_old_num_accumulates
in_num_updates

这3个标量用fill_constant初始化的时候可以用force_cpu属性,让这些标量始终在CPU上,这样GPU计算时,就不用拷贝了。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

如果op是通过继承OperatorWithKernel 实现的话,在执行之前,这里会判断inputs是不是都是在期望的device上并将其转到期望的device上。
但是,OperatorWithKernel提供的自动转换不支持input和output共享内存的情况.
如果不继承OperatorWithKernel, 应该会有一定的修改工作量,可以放在后续PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

明白了,那就现在这样吧。觉得更好的是,支持Variable<int/float>这样的变量作为op的输入。


AddOutput("out_sum_1",
"Output(Tensor or LoDTensor): A tensor used to store the "
"parameter sums with the same shape as input(param).");
AddOutput("out_sum_2",
"Output(Tensor or LoDTensor): A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param). It is used to avoid loss of precision due to too "
"many sums.");
AddOutput("out_sum_3",
"Output(Tensor or LoDTensor): A auxiliary tensor to help "
"accumulating sums of parameter values with the same shape as "
"input(param).");
AddOutput("out_num_accumulates",
"Output(Tensor): The accumulating times of current window with "
"shape [1].");
AddOutput("out_old_num_accumulates",
"Output(Tensor): The accumulating times of previous window with "
"shape [1].");
AddOutput("out_num_updates",
"Output(Tensor): The total number of batches used by trainning "
"before this batch with shape [1].");

AddAttr<float>("average_window",
"The rate of average window size relative to num_updates.");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set 0. as the default value here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

AddAttr<int64_t>("max_average_window", "Maximum size of average window.");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

改下这里的注释吧,让用户手动设置成,一个pass/epoch里总共的mini-batch数。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

AddAttr<int64_t>("min_average_window", "Minimu size of average window.");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set 10000L as the default value for min_average_window ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


AddComment(R"DOC(
AverageAccumulates Operator.
Accumulate the sum of parameter within sliding window. The size of sliding window is determined by 'average_window', 'max_average_window' and 'min_average_window'.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need more details to show how to average.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

)DOC");
}
};

} // namespace operators
} // namespace paddle

// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// Registers the "average_accumulates" operator with its op class and its
// proto/checker maker. EmptyGradOpMaker is passed as the grad-op maker
// (presumably meaning the op has no gradient op -- confirm in the framework).
REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
ops::AverageAccumulatesOpMaker,
paddle::framework::EmptyGradOpMaker);
// Registers the CPU kernels for float and double element types.
REGISTER_OP_CPU_KERNEL(
average_accumulates,
ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
63 changes: 63 additions & 0 deletions paddle/fluid/operators/average_accumulates_op.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/average_accumulates_op.h"
#include "paddle/fluid/platform/gpu_info.h"

namespace paddle {
namespace operators {
// CUDA specialization of getAccumulators.
// Copies one int64_t from each of the "in_old_num_accumulates",
// "in_num_accumulates" and "in_num_updates" input tensors (device memory)
// into the host-side reference parameters, using the kernel's CUDA stream.
template <>
void getAccumulators<paddle::platform::CUDADeviceContext>(
    const framework::ExecutionContext& ctx, int64_t& num_updates_,
    int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
  auto stream = ctx.cuda_device_context().stream();
  // Use the place this kernel actually executes on, so the source device
  // matches the stream obtained from the same context. A default-constructed
  // CUDAPlace does not carry that information (presumably it denotes device
  // 0 -- confirm in platform/place.h).
  const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
  // NOTE(review): the destinations are ordinary host variables that the
  // caller reads right after this returns; this relies on memory::Copy
  // having completed the device-to-host transfer by then -- confirm.
  memory::Copy(platform::CPUPlace(), &old_num_accumulates_, gpu_place,
               in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
               stream);
  memory::Copy(platform::CPUPlace(), &num_accumulates_, gpu_place,
               in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
  memory::Copy(platform::CPUPlace(), &num_updates_, gpu_place,
               in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
}

// CUDA specialization of setAccumulators.
// Copies the three host-side counter values into the first int64_t of the
// "out_old_num_accumulates", "out_num_accumulates" and "out_num_updates"
// output tensors (device memory), using the kernel's CUDA stream.
template <>
void setAccumulators<paddle::platform::CUDADeviceContext>(
    const framework::ExecutionContext& ctx, int64_t num_updates_,
    int64_t num_accumulates_, int64_t old_num_accumulates_) {
  auto stream = ctx.cuda_device_context().stream();
  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");

  // Use the place this kernel actually executes on, so the destination device
  // matches the stream obtained from the same context. A default-constructed
  // CUDAPlace does not carry that information (presumably it denotes device
  // 0 -- confirm in platform/place.h).
  const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());

  // NOTE(review): the sources are by-value parameters that go out of scope
  // when this function returns; this relies on memory::Copy having consumed
  // the host buffer before returning -- confirm.
  memory::Copy(gpu_place, out_old_num_accumulates->data<int64_t>(),
               platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
               stream);
  memory::Copy(gpu_place, out_num_accumulates->data<int64_t>(),
               platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
               stream);
  memory::Copy(gpu_place, out_num_updates->data<int64_t>(),
               platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
}

} // namespace operators
} // namespace paddle

// Short alias used by the registration macro below.
namespace ops = paddle::operators;
// Registers the CUDA kernels of "average_accumulates" for float and double
// element types.
REGISTER_OP_CUDA_KERNEL(
average_accumulates,
ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
110 changes: 110 additions & 0 deletions paddle/fluid/operators/average_accumulates_op.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

// Local shorthand for framework::EigenVector, defaulting to row-major layout
// and Eigen::DenseIndex as the index type.
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

template <typename DeviceContext>
void getAccumulators(const framework::ExecutionContext& ctx,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getAccumulators -> GetAccumulators

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

int64_t& num_updates, int64_t& num_accumulates,
int64_t& old_num_accumulates);

template <typename DeviceContext>
void setAccumulators(const framework::ExecutionContext& ctx,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

setAccumulators -> SetAccumulators

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

int64_t num_updates, int64_t num_accumulates,
int64_t old_num_accumulates);

template <typename DeviceContext, typename T>
class AverageAccumulatesKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// It is used to avoid loss of precision
static const int64_t kMaxNumAccumulates = 16384;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any reference paper for kMaxNumAccumulates 16384?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that 16384 is an experimental value. There are no reference papers.

// Get accumulators from input
int64_t num_updates = 0;
int64_t num_accumulates = 0;
int64_t old_num_accumulates = 0;
getAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
old_num_accumulates);

// Get attrs
float average_window = ctx.Attr<float>("average_window");
int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
min_average_window =
std::min<int64_t>(min_average_window, max_average_window);

// Get inputs
auto* param = ctx.Input<Tensor>("param");
auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
auto param_tensor = EigenVector<T>::Flatten(*param);
auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);

// Get outputs
auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);

// Compute
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
math::SetConstant<DeviceContext, T> constant_functor;
++num_updates;
++num_accumulates;
out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
out_sum_2_tensor.device(place) = in_sum_2_tensor;
out_sum_3_tensor.device(place) = in_sum_3_tensor;
if (num_updates % kMaxNumAccumulates == 0) {
out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add comments before line 87:

Move the sum to a different buffer to avoid loss of precision due to too many sums.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
0.0);
}
if (num_accumulates >= min_average_window &&
num_accumulates >= std::min<int64_t>(max_average_window,
num_updates * average_window)) {
out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add comments before line 94:

Now the average window is too long, discard the old sum.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
0.0);
constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
0.0);
old_num_accumulates = num_accumulates;
num_accumulates = 0;
}

// Set accumulators to output
setAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
old_num_accumulates);
}
};

} // namespace operators
} // namespace paddle
Loading