Commit 40564c3
Merge pull request #72 from NVIDIA/ptq_stabilization
Ptq stabilization
2 parents e6f598f + de3cbc4 · commit 40564c3

11 files changed: +228 −82 lines

cpp/api/BUILD

Lines changed: 1 addition & 0 deletions

```diff
@@ -12,6 +12,7 @@ cc_library(
         "src/extra_info.cpp",
         "src/logging.cpp",
         "src/trtorch.cpp",
+        "src/ptq.cpp"
     ],
     deps = [
         "//core",
```

cpp/api/include/trtorch/ptq.h

Lines changed: 26 additions & 28 deletions

```diff
@@ -6,16 +6,21 @@
 #include <iostream>
 #include <sstream>
 
+#include "trtorch/logging.h"
+
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
 namespace nvinfer1 {
 class IInt8Calibrator;
 class IInt8EntropyCalibrator2;
 }
 
 namespace torch {
-namespace data {
-template<typename Example>
-class Iterator;
+class Tensor;
+}
+
+namespace trtorch {
+namespace ptq {
+bool get_batch_impl(void* bindings[], const char* names[], int nbBindings, torch::Tensor& data);
 }
 }
 #endif //DOXYGEN_SHOULD_SKIP_THIS
@@ -45,7 +50,12 @@ class Int8Calibrator : Algorithm {
    * @param use_cache : bool - Whether to use the cache (if it exists)
    */
   Int8Calibrator(DataLoaderUniquePtr dataloader, const std::string& cache_file_path, bool use_cache)
-    : dataloader_(dataloader.get()), it_(dataloader_->end()), cache_file_path_(cache_file_path), use_cache_(use_cache) {}
+    : dataloader_(dataloader.get()), cache_file_path_(cache_file_path), use_cache_(use_cache) {
+    for (auto batch : *dataloader_) {
+      batched_data_.push_back(batch.data);
+    }
+    it_ = batched_data_.begin();
+  }
 
   /**
    * @brief Get the Batch Size for the next batch (always 1 due to issues with TRT and explicit batch)
@@ -70,26 +80,15 @@ class Int8Calibrator : Algorithm {
    * @return false - There is not a new batch for the calibrator to consume
    */
   bool getBatch(void* bindings[], const char* names[], int nbBindings) override {
-    // HACK: doesnt seem like the first try in the initializer list works
-    if (! it_created_) {
-      it_ = dataloader_->begin();
-      it_created_ = true;
-    }
-
-    if (it_ == dataloader_->end()) {
+    if (it_ != batched_data_.end()) {
+      auto status = get_batch_impl(bindings, names, nbBindings, *it_);
+      it_ = ++it_;
+      return status;
+    } else {
+      // Reset iterator in case the calibrator is going to be used again
+      it_ = batched_data_.begin();
       return false;
     }
-
-    auto batch = *it_;
-
-    for (int i = 0; i < nbBindings; i++) {
-      auto data = batch.data;
-      data = data.to(at::kCUDA).contiguous();
-      bindings[i] = data.data_ptr();
-    }
-
-    it_ = ++it_;
-    return true;
   }
 
   /**
@@ -151,8 +150,6 @@ class Int8Calibrator : Algorithm {
  private:
   /// Pointer to the dataloader
   DataLoader* dataloader_;
-  /// Iterator used to traverse the dataloader
-  torch::data::Iterator<Batch> it_;
   /// Path to cache file
   const std::string& cache_file_path_;
   /// Size of cache
@@ -161,10 +158,11 @@ class Int8Calibrator : Algorithm {
   bool use_cache_;
   /// Cache data
   std::vector<char> cache_;
-  /// If the iterator has been created, DataLoaders can only have 1 live iterator,
-  /// due to some issues this cannot be created at construction, so it is set in the first
-  /// batch, controlled by this flag
-  bool it_created_ = false;
+  /// Batched Data
+  std::vector<torch::Tensor> batched_data_;
+  /// Iterator to move through dataset
+  std::vector<torch::Tensor>::iterator it_;
+
 };
 
 /**
```
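The net effect of this change is that the calibrator no longer fights the single-live-iterator restriction of `torch::data` dataloaders: all batches are copied into a plain `std::vector` at construction, and `getBatch` walks that vector, rewinding when it runs out so the calibrator can be reused. A minimal standalone sketch of the same buffer-and-rewind pattern (illustrative only, not code from this commit):

```cpp
#include <iostream>
#include <utility>
#include <vector>

// Illustrative stand-in for the calibrator's buffer-and-rewind pattern:
// batches are materialized once up front, then consumed one at a time,
// with the iterator reset after exhaustion so a second pass can start.
class BatchCursor {
 public:
  explicit BatchCursor(std::vector<int> batches)
      : batched_data_(std::move(batches)), it_(batched_data_.begin()) {}

  // Mirrors getBatch(): true while data remains, false (and rewind) at the end.
  bool next(int& out) {
    if (it_ != batched_data_.end()) {
      out = *it_;
      ++it_;
      return true;
    }
    it_ = batched_data_.begin();  // reset in case the cursor is reused
    return false;
  }

 private:
  std::vector<int> batched_data_;
  std::vector<int>::iterator it_;
};

int main() {
  BatchCursor cursor({1, 2, 3});
  int b;
  while (cursor.next(b)) std::cout << b << "\n";  // 1 2 3
  while (cursor.next(b)) std::cout << b << "\n";  // rewound: 1 2 3 again
}
```

The tradeoff of this stabilization is memory: every calibration batch now lives in host/device memory for the lifetime of the calibrator, rather than being streamed from the dataloader.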

cpp/api/src/ptq.cpp

Lines changed: 16 additions & 0 deletions

```diff
@@ -0,0 +1,16 @@
+#include "torch/torch.h"
+#include "trtorch/ptq.h"
+
+namespace trtorch {
+namespace ptq {
+
+bool get_batch_impl(void* bindings[], const char* names[], int nbBindings, torch::Tensor& data) {
+  for (int i = 0; i < nbBindings; i++) {
+    data = data.to(at::kCUDA).contiguous();
+    bindings[i] = data.data_ptr();
+  }
+  return true;
+}
+
+} // namespace ptq
+} // namespace trtorch
```
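For context, here is a hedged sketch (not code from this commit) of how this calibrator gets wired into compilation, following the `cpp/ptq` example in this repository. The cache path and input dims are placeholders, and field names such as `ptq_calibrator` are as used by TRTorch of this era; verify against `cpp/ptq/main.cpp`:

```cpp
#include "torch/script.h"
#include "trtorch/trtorch.h"
#include "trtorch/ptq.h"

template <typename DataLoader>
void compile_int8(torch::jit::script::Module& mod,
                  std::unique_ptr<DataLoader> calibration_dataloader) {
  // Named string: Int8Calibrator keeps a reference to the cache path,
  // so a temporary here would dangle.
  const std::string calibration_cache_file = "/tmp/calibration.cache";
  auto calibrator = trtorch::ptq::make_int8_calibrator(
      std::move(calibration_dataloader), calibration_cache_file, /*use_cache=*/true);

  std::vector<std::vector<int64_t>> dims = {{32, 3, 32, 32}};  // placeholder input shape
  auto extra_info = trtorch::ExtraInfo(dims);
  extra_info.op_precision = torch::kI8;    // request INT8 kernels
  extra_info.ptq_calibrator = calibrator;  // TRT pulls batches via getBatch()
  auto trt_mod = trtorch::CompileGraph(mod, extra_info);
  (void)trt_mod;  // use or save the compiled module
}
```

Note that `get_batch_impl` writes the same tensor's device pointer into every binding slot it is asked for, which is fine for the single-input networks the PTQ example targets.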

cpp/benchmark/README.md

Lines changed: 4 additions & 2 deletions

```diff
@@ -1,6 +1,6 @@
 # Benchmarking
 
-This is a quick benchmarking application for TRTorch. It lets you run supported TorchScript modules both in JIT and TRT and returns the average runtime and throughput.
+This is a quick benchmarking application for TRTorch. It lets you run supported TorchScript modules both in JIT and TRT and returns the average runtime and throughput.
 
 ## Compilation / Usage
 
@@ -20,12 +20,14 @@ bazel run //cpp/benchmark --cxxopt="-DNDEBUG" --cxxopt="-DJIT" --cxxopt="-DTRT"
 
 ### Options
 
-You can run a module with JIT or TRT via TRTorch in either FP32 or FP16. These options are controlled by preprocessor directives.
+You can run a module with JIT or TRT via TRTorch in either FP32 or FP16. These options are controlled by preprocessor directives.
 
 - To enable JIT profiling, add the argument `--cxxopt="-DJIT"`
 
 - To enable TRT profiling, add the argument `--cxxopt="-DTRT"`
 
 - To enable FP16 execution, add the argument `--cxxopt="-DHALF"`
 
+- To also save the TRT engine, add the argument `--cxxopt="-DSAVE_ENGINE"`
+
 > It's suggested to also define `--cxxopt="-DNDEBUG"` to supress debug information
```

(The `-`/`+` pairs that render identically are whitespace-only changes in the original file.)
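Combining the new flag with the existing ones, a run that benchmarks TRT and also writes the serialized engine might look like `bazel run //cpp/benchmark --cxxopt="-DNDEBUG" --cxxopt="-DTRT" --cxxopt="-DSAVE_ENGINE" [...]`, with the positional model/shape arguments as documented in the Compilation / Usage section of the README (not shown in this diff).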

cpp/benchmark/main.cpp

Lines changed: 26 additions & 16 deletions

```diff
@@ -105,15 +105,6 @@ int main(int argc, const char* argv[]) {
 
     mod.to(at::kCUDA);
 
-#ifdef HALF
-    mod.to(torch::kHalf);
-    for (auto layer : mod.named_modules()) {
-        if (layer.name.find(".bn") != std::string::npos) {
-            layer.value.to(torch::kFloat);
-        }
-    }
-#endif
-
     std::vector<std::vector<int64_t>> dims;
     for (int i = 2; i < argc; i++) {
         auto arg = std::string(argv[i]);
@@ -129,23 +120,42 @@ int main(int argc, const char* argv[]) {
 
     at::globalContext().setBenchmarkCuDNN(true);
 
-#ifdef JIT
-    auto jit_runtimes = benchmark_module(mod, dims[0]);
-    print_avg_std_dev("JIT", jit_runtimes, dims[0][0]);
-#endif
-
 #ifdef TRT
     auto extra_info = trtorch::ExtraInfo(dims);
-    extra_info.workspace_size = 1 << 24;
+    extra_info.workspace_size = 1 << 20;
 
 #ifdef HALF
-    extra_info.op_precision = at::kHalf;
+    extra_info.op_precision = torch::kF16;
 #endif
 
     auto trt_mod = trtorch::CompileGraph(mod, extra_info);
+
+#ifdef SAVE_ENGINE
+    std::cout << "Compiling graph to save as TRT engine (/tmp/engine_converted_from_jit.trt)" << std::endl;
+    auto engine = trtorch::ConvertGraphToTRTEngine(mod, "forward", extra_info);
+    std::ofstream out("/tmp/engine_converted_from_jit.trt");
+    out << engine;
+    out.close();
+#endif
+
     auto trt_runtimes = benchmark_module(trt_mod, dims[0]);
     print_avg_std_dev("JIT/TRT", trt_runtimes, dims[0][0]);
 #endif
 
+
+#ifdef HALF
+    mod.to(torch::kHalf);
+    for (auto layer : mod.named_modules()) {
+        if (layer.name.find(".bn") != std::string::npos) {
+            layer.value.to(torch::kFloat);
+        }
+    }
+#endif
+
+#ifdef JIT
+    auto jit_runtimes = benchmark_module(mod, dims[0]);
+    print_avg_std_dev("JIT", jit_runtimes, dims[0][0]);
+#endif
+
     std::cout << "ok\n";
 }
```

Note the reordering: the module is now compiled with TRT (in FP32) before the `-DHALF` conversion is applied to the TorchScript module, so the half-precision cast no longer interferes with TRT compilation; the JIT benchmark moves after it accordingly.
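The file written under `-DSAVE_ENGINE` is a serialized TensorRT engine, so it can be reloaded outside of TRTorch with the TensorRT runtime API. A hedged sketch, not part of this commit, assuming a TensorRT 6/7-era API where `deserializeCudaEngine` still takes a plugin-factory argument:

```cpp
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
#include "NvInfer.h"

// Minimal logger required by the TensorRT runtime.
class SimpleLogger : public nvinfer1::ILogger {
  void log(Severity severity, const char* msg) override {
    if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
  }
};

nvinfer1::ICudaEngine* load_engine(const std::string& path) {
  // Read the serialized engine written by the benchmark binary.
  std::ifstream file(path, std::ios::binary);
  std::vector<char> blob((std::istreambuf_iterator<char>(file)),
                         std::istreambuf_iterator<char>());
  static SimpleLogger logger;
  auto runtime = nvinfer1::createInferRuntime(logger);
  // TensorRT releases of this era take an IPluginFactory* third argument.
  return runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);
}
```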

cpp/ptq/BUILD

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,9 +4,9 @@ cc_binary(
     name = "ptq",
     srcs = [
         "main.cpp",
-        "timer.h"
     ],
     deps = [
+        "//cpp/ptq/benchmark",
        "//cpp/ptq/datasets:cifar10",
        "@libtorch//:libtorch",
        "@libtorch//:caffe2",
```

cpp/ptq/benchmark/BUILD

Lines changed: 17 additions & 0 deletions

```diff
@@ -0,0 +1,17 @@
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "benchmark",
+    hdrs = [
+        "benchmark.h"
+    ],
+    srcs = [
+        "benchmark.cpp",
+        "timer.h"
+    ],
+    deps = [
+        "@libtorch//:libtorch",
+        "@libtorch//:caffe2",
+        "//cpp/api:trtorch"
+    ],
+)
```

cpp/ptq/benchmark/benchmark.cpp

Lines changed: 70 additions & 0 deletions

```diff
@@ -0,0 +1,70 @@
+#include "torch/script.h"
+#include "torch/torch.h"
+#include "ATen/Context.h"
+#include "c10/cuda/CUDACachingAllocator.h"
+#include "trtorch/trtorch.h"
+#include "cuda_runtime_api.h"
+
+#include "timer.h"
+
+#define NUM_WARMUP_RUNS 20
+#define NUM_RUNS 100
+
+// Benchmarking code
+void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t batch_size) {
+  float avg_runtime = std::accumulate(runtimes.begin(), runtimes.end(), 0.0) / runtimes.size();
+  float fps = (1000.f / avg_runtime) * batch_size;
+  std::cout << "[" << type << "]: batch_size: " << batch_size << "\n Average latency: " << avg_runtime << " ms\n Average FPS: " << fps << " fps" << std::endl;
+
+  std::vector<float> rt_diff(runtimes.size());
+  std::transform(runtimes.begin(), runtimes.end(), rt_diff.begin(), [avg_runtime](float x) { return x - avg_runtime; });
+  float rt_sq_sum = std::inner_product(rt_diff.begin(), rt_diff.end(), rt_diff.begin(), 0.0);
+  float rt_std_dev = std::sqrt(rt_sq_sum / runtimes.size());
+
+  std::vector<float> fps_diff(runtimes.size());
+  std::transform(runtimes.begin(), runtimes.end(), fps_diff.begin(), [fps, batch_size](float x) { return ((1000.f / x) * batch_size) - fps; });
+  float fps_sq_sum = std::inner_product(fps_diff.begin(), fps_diff.end(), fps_diff.begin(), 0.0);
+  float fps_std_dev = std::sqrt(fps_sq_sum / runtimes.size());
+  std::cout << " Latency Standard Deviation: " << rt_std_dev << "\n FPS Standard Deviation: " << fps_std_dev << "\n(excluding initial warmup runs)" << std::endl;
+}
+
+std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape) {
+  auto execution_timer = timers::PreciseCPUTimer();
+  std::vector<float> execution_runtimes;
+
+  for (uint64_t i = 0; i < NUM_WARMUP_RUNS; i++) {
+    std::vector<torch::jit::IValue> inputs_ivalues;
+    auto in = at::rand(shape, {at::kCUDA});
+#ifdef HALF
+    in = in.to(torch::kHalf);
+#endif
+    inputs_ivalues.push_back(in.clone());
+
+    cudaDeviceSynchronize();
+    mod.forward(inputs_ivalues);
+    cudaDeviceSynchronize();
+
+  }
+
+  for (uint64_t i = 0; i < NUM_RUNS; i++) {
+    std::vector<torch::jit::IValue> inputs_ivalues;
+    auto in = at::rand(shape, {at::kCUDA});
+#ifdef HALF
+    in = in.to(torch::kHalf);
+#endif
+    inputs_ivalues.push_back(in.clone());
+    cudaDeviceSynchronize();
+
+    execution_timer.start();
+    mod.forward(inputs_ivalues);
+    cudaDeviceSynchronize();
+    execution_timer.stop();
+
+    auto time = execution_timer.milliseconds();
+    execution_timer.reset();
+    execution_runtimes.push_back(time);
+
+    c10::cuda::CUDACachingAllocator::emptyCache();
+  }
+  return execution_runtimes;
+}
```
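A sketch of how these two helpers fit together in a driver (hypothetical; the model path and input shape are placeholders, and in the real binaries these come from `argv`):

```cpp
#include "torch/script.h"  // must precede benchmark.h, which has no includes of its own
#include "benchmark.h"

int main() {
  // Placeholder path; the real ptq/benchmark binaries take this from argv.
  torch::jit::script::Module mod = torch::jit::load("/tmp/model.ts");
  mod.to(at::kCUDA);
  mod.eval();

  std::vector<int64_t> shape = {32, 3, 32, 32};  // e.g. a CIFAR10-sized batch
  auto runtimes = benchmark_module(mod, shape);  // 20 warmup runs + 100 timed runs
  print_avg_std_dev("JIT", runtimes, shape[0]);  // mean/std-dev of latency and FPS
  return 0;
}
```

Each timed iteration brackets `mod.forward` with `cudaDeviceSynchronize()` so the CPU timer measures the full device-side execution, and the CUDA caching allocator is emptied between runs to keep allocations from skewing successive measurements.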

cpp/ptq/benchmark/benchmark.h

Lines changed: 4 additions & 0 deletions

```diff
@@ -0,0 +1,4 @@
+#pragma once
+
+void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t batch_size);
+std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape);
```
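Note that this header names `std::string`, `std::vector`, and `torch::jit::script::Module` without including their headers, so any translation unit that includes it must pull in `torch/script.h` (and with it the standard headers) first.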
File renamed without changes.
