Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 68 additions & 6 deletions paddle/fluid/framework/init.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,29 @@ limitations under the License. */
namespace paddle {
namespace framework {

DEFINE_string(devices, "", "The devices to be used.");
DEFINE_bool(init_p2p, true, "Whether to init p2p.");

std::once_flag gflags_init_flag;
std::once_flag p2p_init_flag;

using paddle::platform::DeviceContextPool;

// One-shot framework entry point.
//
// Forwards `argv` to InitGflags(), then splits the --devices flag value on ','
// and converts every piece to an integer device id with std::stoi. The
// resulting id list, together with the --init_p2p flag, is handed to
// InitDevices(bool, std::vector<int>).
//
// Note: std::stoi throws (std::invalid_argument / std::out_of_range) when a
// piece is not a parsable integer; that exception is not caught here.
void Init(std::vector<std::string> argv) {
  InitGflags(argv);

  // Collect the device ids listed in FLAGS_devices ("0,1,2" -> {0, 1, 2}).
  std::vector<int> device_ids;
  std::istringstream flag_stream(FLAGS_devices);
  for (std::string piece; std::getline(flag_stream, piece, ',');) {
    const int id = std::stoi(piece);
    device_ids.push_back(id);
  }

  InitDevices(FLAGS_init_p2p, device_ids);
}

void InitGflags(std::vector<std::string> argv) {
std::call_once(gflags_init_flag, [&]() {
argv.push_back("dummy");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里是不是应该插入到argv[0]之前?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

是的,感谢提醒。

int argc = argv.size();
char **arr = new char *[argv.size()];
std::string line;
Expand Down Expand Up @@ -65,13 +83,52 @@ void InitP2P(int count) {
#endif
}

void InitP2P(std::vector<int> devices) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

line67行的InitP2P函数,可以直接调用该函数吧

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Thx.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

我的意思是:void InitP2P(int count) 和 你新加的 InitP2P(std::vector<int> devices) 内部实现几乎一样,可以先把 std::vector<int> devices构造出来,直接调用新增的这个。 或者上面那个还有必要存在吗?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@qingqing01 明白了,我改成在InitDevices(bool initP2P) 里调用InitDevices(bool initP2P, vector<int> devices), 然后void InitP2P(int count)也就没用了,已将其删除。

// Body of InitP2P(std::vector<int> devices): tries to enable CUDA peer-to-peer
// access between every ordered pair of distinct ids in `devices`.
// Everything below is compiled only when PADDLE_WITH_CUDA is defined;
// otherwise the function body is empty.
#ifdef PADDLE_WITH_CUDA
// Guarded by p2p_init_flag, so the pairwise setup executes at most once even
// if InitP2P is called again (later calls with a different list do nothing).
std::call_once(p2p_init_flag, [&]() {
int count = devices.size();
for (int i = 0; i < count; ++i) {
for (int j = 0; j < count; ++j) {
// Skip a device paired with itself (also skips duplicate ids in the list).
if (devices[i] == devices[j]) continue;
int can_acess = -1;
// A failing capability query is treated as a hard error via PADDLE_ENFORCE.
PADDLE_ENFORCE(
cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]),
"Failed to test P2P access.");
if (can_acess != 1) {
// Peer access unavailable for this pair: only warn and keep going.
LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
<< " to " << devices[j];
} else {
// Make devices[i] the current device, then enable its access to devices[j].
// NOTE(review): the return codes of these two CUDA calls are not checked.
cudaSetDevice(devices[i]);
cudaDeviceEnablePeerAccess(devices[j], 0);
}
}
}
});
#endif
}

void InitDevices(bool init_p2p) {
/*Init all available devices by default */
/*Init all available devices by default */
#ifdef PADDLE_WITH_CUDA
std::vector<int> devices;
try {
int count = platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
}
#else
LOG(WARNING)
<< "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif
InitDevices(init_p2p, devices);
}

void InitDevices(bool init_p2p, const std::vector<int> devices) {
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
int count = 0;

#ifdef PADDLE_WITH_CUDA
try {
count = platform::GetCUDADeviceCount();
Expand All @@ -83,12 +140,17 @@ void InitDevices(bool init_p2p) {
<< "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif

for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
for (size_t i = 0; i < devices.size(); ++i) {
if (devices[i] >= count || devices[i] < 0) {
LOG(WARNING) << "Invalid devices id.";
continue;
}
places.emplace_back(platform::CUDAPlace(devices[i]));
}
if (init_p2p) {
InitP2P(count);
InitP2P(devices);
}
places.emplace_back(platform::CPUPlace());
platform::DeviceContextPool::Init(places);
}

Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/framework/init.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,15 @@ limitations under the License. */
namespace paddle {
namespace framework {

// Forwards argv to InitGflags(), then parses the comma-separated --devices
// flag into integer ids and calls InitDevices(FLAGS_init_p2p, ids).
void Init(std::vector<std::string> argv);

// Performs its setup at most once per process (guarded by std::call_once in
// init.cc). NOTE(review): presumably hands argv to gflags for parsing; the
// full definition is not visible here, so confirm against init.cc.
void InitGflags(std::vector<std::string> argv);

// NOTE(review): definition not shown in this view; presumably configures glog
// using prog_name. Confirm against init.cc.
void InitGLOG(const std::string &prog_name);

// Initializes all devices available by default: gathers ids 0..count-1 from
// platform::GetCUDADeviceCount() and delegates to the overload below.
void InitDevices(bool init_p2p);

// Initializes platform::DeviceContextPool with a CPU place plus a CUDA place
// for each id in `devices`; ids that are negative or not below the detected
// device count are skipped with a warning. When init_p2p is true, InitP2P is
// invoked as well.
void InitDevices(bool init_p2p, const std::vector<int> devices);

} // namespace framework
} // namespace paddle
4 changes: 1 addition & 3 deletions paddle/fluid/inference/io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@ limitations under the License. */
namespace paddle {
namespace inference {

// Temporary wrapper that exposes framework::InitDevices() through the
// inference namespace, so the symbol is available when linking the inference
// shared library. `init_p2p` is forwarded unchanged.
void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
// Inference-side entry point: forwards `argv` unchanged to framework::Init().
// Note that `argv` is taken by (const) value, so the vector is copied on call.
void Init(const std::vector<std::string> argv) { framework::Init(argv); }

void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/inference/io.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ limitations under the License. */
namespace paddle {
namespace inference {

// Wrapper around framework::InitDevices(init_p2p) (see io.cc).
void Init(bool init_p2p);
// Wrapper around framework::Init(argv) (see io.cc); argv is passed by value.
void Init(const std::vector<std::string> argv);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are there two Init interfaces, one in paddle/fluid/inference/io.h and another in paddle/fluid/framework/init.h?

Copy link
Contributor Author

@wanghaoshuang wanghaoshuang Apr 24, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

paddle/fluid/framework/init.h中的init确实多余,也没有被其它地方用到,已经将其删除。

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@qingqing01 @wanghaoshuang

// Temporarily add this function for exposing framework::InitDevices() when
// linking the inference shared library.
void Init(bool init_p2p) { framework::InitDevices(init_p2p); }

这里有注释。因为当前libpaddle_fluid.so不再使用whole-archive链接,而framework/init.h中的函数,没有被Fluid其他的C++代码调用到,在链接生成libpaddle_fluid.so的时候,framework/init.h里面的符号就没有链接进来。用户inference代码里面需要显式调用paddle::framework::InitDevices,在使用libpaddle_fluid.so的时候,会出现undefined symbols paddle::framework::InitDevices的错误。


void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program,
Expand Down