Skip to content

Commit b594251

Browse files
Merge pull request #9082 from wanghaoshuang/average_model
Add model average optimizer for fluid
2 parents 1d8fe2a + edb4e29 commit b594251

File tree

6 files changed

+568
-8
lines changed

6 files changed

+568
-8
lines changed
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#include "paddle/fluid/operators/average_accumulates_op.h"
16+
17+
namespace paddle {
18+
namespace operators {
19+
20+
template <>
21+
void GetAccumulators<paddle::platform::CPUDeviceContext>(
22+
const framework::ExecutionContext& ctx, int64_t& num_updates_,
23+
int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
24+
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
25+
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
26+
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
27+
28+
old_num_accumulates_ = in_old_num_accumulates->data<int64_t>()[0];
29+
num_accumulates_ = in_num_accumulates->data<int64_t>()[0];
30+
num_updates_ = in_num_updates->data<int64_t>()[0];
31+
}
32+
33+
template <>
34+
void SetAccumulators<paddle::platform::CPUDeviceContext>(
35+
const framework::ExecutionContext& ctx, int64_t num_updates_,
36+
int64_t num_accumulates_, int64_t old_num_accumulates_) {
37+
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
38+
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
39+
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
40+
41+
out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates_;
42+
out_num_accumulates->data<int64_t>()[0] = num_accumulates_;
43+
out_num_updates->data<int64_t>()[0] = num_updates_;
44+
}
45+
46+
class AverageAccumulatesOp : public framework::OperatorWithKernel {
47+
public:
48+
using framework::OperatorWithKernel::OperatorWithKernel;
49+
50+
void InferShape(framework::InferShapeContext* ctx) const override {
51+
PADDLE_ENFORCE(
52+
ctx->HasInput("param"),
53+
"Input (param) of average_accumulates op should not be null.");
54+
PADDLE_ENFORCE(
55+
ctx->HasInput("in_sum_1"),
56+
"Input (sum_1) of average_accumulates op should not be null.");
57+
PADDLE_ENFORCE(
58+
ctx->HasInput("in_sum_2"),
59+
"Input (sum_2) of average_accumulates op should not be null.");
60+
PADDLE_ENFORCE(
61+
ctx->HasInput("in_sum_3"),
62+
"Input (sum_3) of average_accumulates op should not be null.");
63+
PADDLE_ENFORCE(
64+
ctx->HasInput("in_num_accumulates"),
65+
"Input (in_num_accumulates) of average_accumulates op should "
66+
"not be null.");
67+
PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
68+
"Input (old_num_accumulates) of average_accumulates op "
69+
"should not be null.");
70+
PADDLE_ENFORCE(
71+
ctx->HasInput("in_num_updates"),
72+
"Input (num_updates) of average_accumulates op should not be null.");
73+
74+
PADDLE_ENFORCE(
75+
ctx->HasOutput("out_sum_1"),
76+
"Output (sum_1) of average_accumulates op should not be null.");
77+
PADDLE_ENFORCE(
78+
ctx->HasOutput("out_sum_2"),
79+
"Output (sum_2) of average_accumulates op should not be null.");
80+
PADDLE_ENFORCE(
81+
ctx->HasOutput("out_sum_3"),
82+
"Output (sum_3) of average_accumulates op should not be null.");
83+
PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
84+
"Output (num_accumulates) of average_accumulates op should "
85+
"not be null.");
86+
PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
87+
"Output (old_num_accumulates) of average_accumulates op "
88+
"should not be null.");
89+
PADDLE_ENFORCE(
90+
ctx->HasOutput("out_num_updates"),
91+
"Output (num_updates) of average_accumulates op should not be null.");
92+
93+
auto in_dim = ctx->GetInputDim("param");
94+
95+
ctx->SetOutputDim("out_sum_1", in_dim);
96+
ctx->SetOutputDim("out_sum_2", in_dim);
97+
ctx->SetOutputDim("out_sum_3", in_dim);
98+
ctx->SetOutputDim("out_num_accumulates", {1});
99+
ctx->SetOutputDim("out_old_num_accumulates", {1});
100+
ctx->SetOutputDim("out_num_updates", {1});
101+
}
102+
103+
protected:
104+
framework::OpKernelType GetExpectedKernelType(
105+
const framework::ExecutionContext& ctx) const override {
106+
return framework::OpKernelType(
107+
framework::ToDataType(ctx.Input<Tensor>("param")->type()),
108+
ctx.GetPlace());
109+
}
110+
};
111+
112+
// Proto maker for average_accumulates: declares the op's tensors, attributes
// and user-facing documentation. Fixes several typos in the user-visible
// descriptions ("A auxiliary" -> "An auxiliary", "trainning" -> "training",
// "Minimu" -> "Minimum", "whtin" -> "within").
class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("param", "(Tensor), The parameter to be accumulated.");
    AddInput("in_sum_1",
             "(Tensor), A tensor used to store the parameter "
             "sums with the same shape as input(param).");
    AddInput("in_sum_2",
             "(Tensor), An auxiliary tensor to help "
             "accumulating sums of parameter values with the same shape as "
             "input(param). It is used to avoid loss of precision due to too "
             "many sums.");
    AddInput("in_sum_3",
             "(Tensor), An auxiliary tensor to help "
             "accumulating sums of parameter values with the same shape as "
             "input(param).");
    AddInput("in_num_accumulates",
             "(Tensor<int64_t>), The accumulating times of current window with "
             "shape [1].");
    AddInput(
        "in_old_num_accumulates",
        "(Tensor<int64_t>), The accumulating times of previous window with "
        "shape [1].");
    AddInput("in_num_updates",
             "(Tensor<int64_t>), The total number of batches used by training "
             "before this batch with shape [1].");

    AddOutput("out_sum_1",
              "(Tensor), A tensor used to store the "
              "parameter sums with the same shape as input(param).");
    AddOutput("out_sum_2",
              "(Tensor), An auxiliary tensor to help "
              "accumulating sums of parameter values with the same shape as "
              "input(param). It is used to avoid loss of precision due to too "
              "many sums.");
    AddOutput("out_sum_3",
              "(Tensor), An auxiliary tensor to help "
              "accumulating sums of parameter values with the same shape as "
              "input(param).");
    AddOutput(
        "out_num_accumulates",
        "(Tensor<int64_t>), The accumulating times of current window with "
        "shape [1].");
    AddOutput(
        "out_old_num_accumulates",
        "(Tensor<int64_t>) The accumulating times of previous window with "
        "shape [1].");
    AddOutput(
        "out_num_updates",
        "(Tensor<int64_t>), The total number of batches used by training "
        "before this batch with shape [1].");

    AddAttr<float>("average_window",
                   "(float, default 0) "
                   "The rate of average window size relative to num_updates.")
        .SetDefault(0);
    AddAttr<int64_t>("max_average_window",
                     "(int64_t) "
                     "Maximum size of average window. It suggests that the "
                     "number of mini-batches "
                     "in one pass is appropriate value to set.");
    AddAttr<int64_t>("min_average_window",
                     "(int64_t, default 10000L) "
                     "Minimum size of average window.")
        .SetDefault(10000L);

    AddComment(R"DOC(
AverageAccumulates Operator.
Accumulate the sum of parameter within sliding window. The size of sliding window is
determined by 'average_window', 'max_average_window' and 'min_average_window'.
Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.

All the accumulators were inited to zero before training.

And for a mini-batch in training, accumulators were computed as below steps:
    num_updates += 1
    num_accumulates += 1
    sum_1 += param
    if num_updates % kMaxNumAccumulates == 0:
        sum_2 += sum_1
        sum_1 = 0
    if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window):
        sum_3 = sum_1 + sum_2
        sum_1 = 0
        sum_2 = 0
        old_num_accumulates = num_accumulates
        num_accumulates = 0

)DOC");
  }
};
205+
206+
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
// average_accumulates is optimizer-side bookkeeping, so it registers an empty
// gradient op maker: there is no gradient to compute.
REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
                  ops::AverageAccumulatesOpMaker,
                  paddle::framework::EmptyGradOpMaker);
// CPU kernels for float and double parameter tensors.
REGISTER_OP_CPU_KERNEL(
    average_accumulates,
    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#include "paddle/fluid/operators/average_accumulates_op.h"
16+
#include "paddle/fluid/platform/gpu_info.h"
17+
18+
namespace paddle {
19+
namespace operators {
20+
template <>
21+
void GetAccumulators<paddle::platform::CUDADeviceContext>(
22+
const framework::ExecutionContext& ctx, int64_t& num_updates_,
23+
int64_t& num_accumulates_, int64_t& old_num_accumulates_) {
24+
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
25+
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
26+
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
27+
auto stream = ctx.cuda_device_context().stream();
28+
memory::Copy(platform::CPUPlace(), &old_num_accumulates_,
29+
platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
30+
sizeof(int64_t), stream);
31+
memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(),
32+
in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
33+
memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(),
34+
in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
35+
}
36+
37+
template <>
38+
void SetAccumulators<paddle::platform::CUDADeviceContext>(
39+
const framework::ExecutionContext& ctx, int64_t num_updates_,
40+
int64_t num_accumulates_, int64_t old_num_accumulates_) {
41+
auto stream = ctx.cuda_device_context().stream();
42+
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
43+
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
44+
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
45+
46+
memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(),
47+
platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
48+
stream);
49+
memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(),
50+
platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
51+
stream);
52+
memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(),
53+
platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
54+
}
55+
56+
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
// CUDA kernels for float and double parameter tensors.
REGISTER_OP_CUDA_KERNEL(
    average_accumulates,
    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#pragma once
16+
#include <algorithm>
17+
#include "paddle/fluid/framework/eigen.h"
18+
#include "paddle/fluid/framework/op_registry.h"
19+
#include "paddle/fluid/operators/math/math_function.h"
20+
21+
namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;

// Reads the scalar accumulator states (num_updates, num_accumulates,
// old_num_accumulates) from the op's [1]-shaped int64 input tensors into the
// caller's host integers. Specialized per device context (CPU in the .cc
// file, CUDA in the .cu file).
template <typename DeviceContext>
void GetAccumulators(const framework::ExecutionContext& ctx,
                     int64_t& num_updates, int64_t& num_accumulates,
                     int64_t& old_num_accumulates);

// Writes the scalar accumulator states back to the op's [1]-shaped int64
// output tensors. Specialized per device context, mirroring GetAccumulators.
template <typename DeviceContext>
void SetAccumulators(const framework::ExecutionContext& ctx,
                     int64_t num_updates, int64_t num_accumulates,
                     int64_t old_num_accumulates);
39+
40+
// Kernel performing one step of the sliding-window parameter-sum update
// described in the op's DOC: sum_1 accumulates the parameter every batch,
// sum_2 absorbs sum_1 every kMaxNumAccumulates updates to limit floating
// point precision loss, and sum_3 receives the whole window sum when the
// window closes. NOTE(review): per the op's documentation each in_* tensor
// shares memory with its out_* counterpart, so statement order here is
// behavior-critical — confirm the in-place wiring before reordering.
template <typename DeviceContext, typename T>
class AverageAccumulatesKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // It is used to avoid loss of precision
    static const int64_t kMaxNumAccumulates = 16384;
    // Get accumulators from input
    int64_t num_updates = 0;
    int64_t num_accumulates = 0;
    int64_t old_num_accumulates = 0;
    GetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
                                   old_num_accumulates);

    // Get attrs
    float average_window = ctx.Attr<float>("average_window");
    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
    // A min window larger than the max window would make the window-close
    // condition below unsatisfiable, so clamp it.
    min_average_window =
        std::min<int64_t>(min_average_window, max_average_window);

    // Get inputs
    auto* param = ctx.Input<Tensor>("param");
    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
    auto param_tensor = EigenVector<T>::Flatten(*param);
    auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
    auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
    auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);

    // Get outputs
    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
    auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
    auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
    auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);

    // Compute
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    math::SetConstant<DeviceContext, T> constant_functor;
    ++num_updates;
    ++num_accumulates;
    // sum_1 += param; sum_2 and sum_3 are carried over unchanged (a no-op
    // when in/out share memory as documented).
    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
    out_sum_2_tensor.device(place) = in_sum_2_tensor;
    out_sum_3_tensor.device(place) = in_sum_3_tensor;
    if (num_updates % kMaxNumAccumulates == 0) {
      // Move the sum to a different buffer to avoid loss of precision due to
      // too many sums.
      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
                       0.0);
    }
    // Window-close test. Note num_updates * average_window is evaluated in
    // float and then truncated to int64 by std::min<int64_t>.
    if (num_accumulates >= min_average_window &&
        num_accumulates >= std::min<int64_t>(max_average_window,
                                             num_updates * average_window)) {
      // Now the average window is too long, discard the old sum.
      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
                       0.0);
      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
                       0.0);
      // Remember how many batches the closed window covered, then start a
      // fresh window.
      old_num_accumulates = num_accumulates;
      num_accumulates = 0;
    }

    // Set accumulators to output
    SetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
                                   old_num_accumulates);
  }
};
111+
112+
} // namespace operators
113+
} // namespace paddle

0 commit comments

Comments
 (0)