Merged
Changes from 8 commits
15 changes: 15 additions & 0 deletions paddle/fluid/inference/engine.h
@@ -19,6 +19,9 @@ limitations under the License. */
namespace paddle {
namespace inference {

struct Buffer;
enum class DeviceType { UNK = -1, CPU, GPU };

/*
* EngineBase is the base class of all inference engines. An inference engine
* takes a paddle program as input, and outputs the result in fluid Tensor
@@ -45,8 +48,20 @@ class EngineBase {
// Execute the engine, that will run the inference network.
virtual void Execute(int batch_size) = 0;

// Return the IO buffer allocated in the engine. One can read/write directly
// on the buffer. If Buffer::buffer is nullptr, one can also allocate the
// memory and maintain it outside the engine.
virtual Buffer& buffer(const std::string& tensor) = 0;
Contributor: tensor->name — the parameter here refers to the buffer's name, right?
virtual ~EngineBase() {}
}; // class EngineBase

struct Buffer {
void* buffer{nullptr}; // buffer should be allocated only once.
int max_size; // buffer allocated space.
int size; // data size.
DeviceType device{DeviceType::UNK}; // tells which device this buffer is on.
};

} // namespace inference
} // namespace paddle
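As a minimal sketch of how a caller might use the Buffer contract above (the engine instance, the tensor name "input0", and the raw CUDA calls are assumptions for illustration, not part of this patch):

```cpp
// Sketch only: feed host data into an engine's IO buffer.
// Assumes the engine has already allocated its buffers (e.g. after
// TensorRTEngine::FreezeNetwork) and that a tensor named "input0" exists.
#include <cuda_runtime.h>
#include <vector>
#include "paddle/fluid/inference/engine.h"

void FeedInput(paddle::inference::EngineBase* engine,
               const std::vector<float>& host_data, cudaStream_t stream) {
  namespace inf = paddle::inference;
  inf::Buffer& buf = engine->buffer("input0");  // "input0" is a placeholder
  size_t bytes = host_data.size() * sizeof(float);

  if (buf.buffer == nullptr) {
    // The engine left this slot unallocated; the caller may own the memory.
    cudaMalloc(&buf.buffer, bytes);
    buf.max_size = static_cast<int>(bytes);
    buf.device = inf::DeviceType::GPU;
  }
  // Write directly into the device buffer the engine will read from.
  cudaMemcpyAsync(buf.buffer, host_data.data(), bytes,
                  cudaMemcpyHostToDevice, stream);
  buf.size = static_cast<int>(bytes);
}
```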
5 changes: 3 additions & 2 deletions paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,4 +1,5 @@
nv_library(tensorrt_engine SRCS engine.h engine.cc helper.h ../engine.h)
Contributor: nv_library(tensorrt_engine SRCS engine.cc)

nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
nv_test(test_tensorrt_engine SRCS test_engine.cc engine.cc DEPS dynload_cuda)
set(ENGINE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/engine.cc)
nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)

add_subdirectory(convert)
4 changes: 2 additions & 2 deletions paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,4 +1,4 @@
nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
nv_test(test_trt_activation_op SRCS test_activation_op.cc ${ENGINE_FILE} activation_op.cc
DEPS ${FLUID_CORE_MODULES} activation_op)
nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine)
nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
44 changes: 30 additions & 14 deletions paddle/fluid/inference/tensorrt/engine.cc
@@ -30,16 +30,24 @@ void TensorRTEngine::Build(const DescType& paddle_model) {
}

void TensorRTEngine::Execute(int batch_size) {
infer_context_->enqueue(batch_size, buffers_.data(), *stream_, nullptr);
std::vector<void*> buffers;
for (auto& buf : buffers_) {
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
PADDLE_ENFORCE_GT(buf.max_size, 0);
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buffers.push_back(buf.buffer);
}
infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
cudaStreamSynchronize(*stream_);
}

TensorRTEngine::~TensorRTEngine() {
// clean buffer
for (auto& buffer : buffers_) {
if (buffer != nullptr) {
PADDLE_ENFORCE_EQ(0, cudaFree(buffer));
buffer = nullptr;
for (auto& buf : buffers_) {
if (buf.buffer != nullptr) {
PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
buf.buffer = nullptr;
buf.max_size = 0;
}
}
}
@@ -59,15 +67,19 @@ void TensorRTEngine::FreezeNetwork() {
infer_context_.reset(infer_engine_->createExecutionContext());

// allocate GPU buffers.
buffers_.resize(buffer_sizes_.size(), nullptr);
buffers_.resize(buffer_sizes_.size());
for (auto& item : buffer_sizes_) {
if (item.second == 0) {
auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
item.second = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] *
AccumDims(infer_engine_->getBindingDimensions(slot_offset));
}
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buffer(item.first), item.second));
auto& buf = buffer(item.first);
CHECK(buf.buffer == nullptr); // buffer should be allocated only once.
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
buf.size = buf.max_size = item.second;
buf.device = DeviceType::GPU;
}
}

@@ -113,7 +125,7 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
}

void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
return buffer(name);
return buffer(name).buffer;
}

void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
@@ -123,11 +135,13 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_GE(max_size, it->second);
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buffer(name), it->second,
auto& buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
cudaMemcpyDeviceToHost, *stream_));
}

void*& TensorRTEngine::buffer(const std::string& name) {
Buffer& TensorRTEngine::buffer(const std::string& name) {
PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
@@ -137,10 +151,12 @@ void*& TensorRTEngine::buffer(const std::string& name) {

void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
size_t size) {
void* buf = buffer(name);
cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_);
PADDLE_ENFORCE_EQ(
0, cudaMemcpyAsync(buf, data, size, cudaMemcpyHostToDevice, *stream_));
auto& buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyHostToDevice, *stream_));
}

void TensorRTEngine::SetITensor(const std::string& name,
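The hunk above cuts off inside TensorRTEngine::buffer(); a plausible completion (an assumption inferred from the surrounding members, not text taken from this PR) resolves the tensor name to a TensorRT binding slot and returns the matching Buffer:

```cpp
// Assumed completion of the accessor shown above: look up the binding slot
// for the tensor name and hand back the corresponding engine-owned Buffer.
Buffer& TensorRTEngine::buffer(const std::string& name) {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
  return buffers_[slot_offset];
}
```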
6 changes: 4 additions & 2 deletions paddle/fluid/inference/tensorrt/engine.h
@@ -87,7 +87,9 @@ class TensorRTEngine : public EngineBase {
// these memory directly for acceleration, for example, output the converted
// data directly to the buffer to save data copy overhead.
// NOTE this should be used after calling `FreezeNetwork`.
void*& buffer(const std::string& name);
Buffer& buffer(const std::string& tensor) override;
Contributor: tensor->name

cudaStream_t* stream() { return stream_; }

// Fill an input from CPU memory with name and size.
void SetInputFromCPU(const std::string& name, void* data, size_t size);
@@ -116,7 +118,7 @@ class TensorRTEngine : public EngineBase {
cudaStream_t* stream_;
nvinfer1::ILogger& logger_;

std::vector<void*> buffers_;
std::vector<Buffer> buffers_;
// max data size for the buffers.
std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
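Putting the pieces of this diff together, a typical round trip through the new Buffer-based API might look like the sketch below (the namespace path, the tensor names "x"/"y", and the element counts are assumptions; only the method names come from this patch):

```cpp
// Sketch: one inference pass using the methods touched by this PR.
#include <vector>
#include "paddle/fluid/inference/tensorrt/engine.h"

void RunOnce(paddle::inference::tensorrt::TensorRTEngine* engine,
             const std::vector<float>& input, int batch_size) {
  // After the network is declared, freeze it: this builds the TensorRT
  // execution context and cudaMalloc's every IO buffer (DeviceType::GPU).
  engine->FreezeNetwork();

  // Copy host input into the engine-owned device buffer; the size check
  // against Buffer::max_size happens inside SetInputFromCPU.
  engine->SetInputFromCPU("x", const_cast<float*>(input.data()),
                          input.size() * sizeof(float));

  // Enqueue the network on the engine's stream, then synchronize.
  engine->Execute(batch_size);

  // Copy the result back to host memory ("y" and its size are placeholders).
  std::vector<float> output(input.size());
  engine->GetOutputInCPU("y", output.data(), output.size() * sizeof(float));
}
```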