Commit 40564c3
Merge pull request #72 from NVIDIA/ptq_stabilization
Ptq stabilization
2 parents e6f598f + de3cbc4 · commit 40564c3

11 files changed: +228 −82 lines

cpp/api/BUILD

Lines changed: 1 addition & 0 deletions

```diff
@@ -12,6 +12,7 @@ cc_library(
         "src/extra_info.cpp",
         "src/logging.cpp",
         "src/trtorch.cpp",
+        "src/ptq.cpp"
     ],
     deps = [
         "//core",
```

cpp/api/include/trtorch/ptq.h

Lines changed: 26 additions & 28 deletions

```diff
@@ -6,16 +6,21 @@
 #include <iostream>
 #include <sstream>
 
+#include "trtorch/logging.h"
+
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
 namespace nvinfer1 {
 class IInt8Calibrator;
 class IInt8EntropyCalibrator2;
 }
 
 namespace torch {
-namespace data {
-template<typename Example>
-class Iterator;
+class Tensor;
+}
+
+namespace trtorch {
+namespace ptq {
+bool get_batch_impl(void* bindings[], const char* names[], int nbBindings, torch::Tensor& data);
 }
 }
 #endif //DOXYGEN_SHOULD_SKIP_THIS
@@ -45,7 +50,12 @@ class Int8Calibrator : Algorithm {
    * @param use_cache : bool - Whether to use the cache (if it exists)
    */
   Int8Calibrator(DataLoaderUniquePtr dataloader, const std::string& cache_file_path, bool use_cache)
-    : dataloader_(dataloader.get()), it_(dataloader_->end()), cache_file_path_(cache_file_path), use_cache_(use_cache) {}
+    : dataloader_(dataloader.get()), cache_file_path_(cache_file_path), use_cache_(use_cache) {
+    for (auto batch : *dataloader_) {
+      batched_data_.push_back(batch.data);
+    }
+    it_ = batched_data_.begin();
+  }
 
   /**
    * @brief Get the Batch Size for the next batch (always 1 due to issues with TRT and explicit batch)
@@ -70,26 +80,15 @@ class Int8Calibrator : Algorithm {
    * @return false - There is not a new batch for the calibrator to consume
    */
   bool getBatch(void* bindings[], const char* names[], int nbBindings) override {
-    // HACK: doesnt seem like the first try in the initializer list works
-    if (! it_created_) {
-      it_ = dataloader_->begin();
-      it_created_ = true;
-    }
-
-    if (it_ == dataloader_->end()) {
+    if (it_ != batched_data_.end()) {
+      auto status = get_batch_impl(bindings, names, nbBindings, *it_);
+      it_ = ++it_;
+      return status;
+    } else {
+      // Reset iterator in case the calibrator is going to be used again
+      it_ = batched_data_.begin();
       return false;
     }
-
-    auto batch = *it_;
-
-    for (int i = 0; i < nbBindings; i++) {
-      auto data = batch.data;
-      data = data.to(at::kCUDA).contiguous();
-      bindings[i] = data.data_ptr();
-    }
-
-    it_ = ++it_;
-    return true;
   }
 
   /**
@@ -151,8 +150,6 @@ class Int8Calibrator : Algorithm {
  private:
   /// Pointer to the dataloader
   DataLoader* dataloader_;
-  /// Iterator used to traverse the dataloader
-  torch::data::Iterator<Batch> it_;
   /// Path to cache file
   const std::string& cache_file_path_;
   /// Size of cache
@@ -161,10 +158,11 @@ class Int8Calibrator : Algorithm {
   bool use_cache_;
   /// Cache data
   std::vector<char> cache_;
-  /// If the iterator has been created, DataLoaders can only have 1 live iterator,
-  /// due to some issues this cannot be created at construction, so it is set in the first
-  /// batch, controlled by this flag
-  bool it_created_ = false;
+  /// Batched Data
+  std::vector<torch::Tensor> batched_data_;
+  /// Iterator to move through dataset
+  std::vector<torch::Tensor>::iterator it_;
+
 };
 
 /**
```
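The net effect of this change is that the calibrator no longer fights the single-live-iterator restriction of `torch::data` dataloaders: all batches are copied into a plain `std::vector` at construction, and `getBatch` walks that vector, rewinding when it runs out so the calibrator can be reused. A minimal standalone sketch of the same buffer-and-rewind pattern (illustrative only, not code from this commit):

```cpp
#include <iostream>
#include <utility>
#include <vector>

// Illustrative stand-in for the calibrator's buffer-and-rewind pattern:
// batches are materialized once up front, then consumed one at a time,
// with the iterator reset after exhaustion so a second pass can start.
class BatchCursor {
 public:
  explicit BatchCursor(std::vector<int> batches)
      : batched_data_(std::move(batches)), it_(batched_data_.begin()) {}

  // Mirrors getBatch(): true while data remains, false (and rewind) at the end.
  bool next(int& out) {
    if (it_ != batched_data_.end()) {
      out = *it_;
      ++it_;
      return true;
    }
    it_ = batched_data_.begin();  // reset in case the cursor is reused
    return false;
  }

 private:
  std::vector<int> batched_data_;
  std::vector<int>::iterator it_;
};

int main() {
  BatchCursor cursor({1, 2, 3});
  int b;
  while (cursor.next(b)) std::cout << b << "\n";  // 1 2 3
  while (cursor.next(b)) std::cout << b << "\n";  // rewound: 1 2 3 again
}
```

The tradeoff of this stabilization is memory: every calibration batch now lives in host/device memory for the lifetime of the calibrator, rather than being streamed from the dataloader.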

cpp/api/src/ptq.cpp

Lines changed: 16 additions & 0 deletions

```diff
@@ -0,0 +1,16 @@
+#include "torch/torch.h"
+#include "trtorch/ptq.h"
+
+namespace trtorch {
+namespace ptq {
+
+bool get_batch_impl(void* bindings[], const char* names[], int nbBindings, torch::Tensor& data) {
+  for (int i = 0; i < nbBindings; i++) {
+    data = data.to(at::kCUDA).contiguous();
+    bindings[i] = data.data_ptr();
+  }
+  return true;
+}
+
+} // namespace ptq
+} // namespace trtorch
```
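For context, here is a hedged sketch (not code from this commit) of how this calibrator gets wired into compilation, following the `cpp/ptq` example in this repository. The cache path and input dims are placeholders, and field names such as `ptq_calibrator` are as used by TRTorch of this era; verify against `cpp/ptq/main.cpp`:

```cpp
#include "torch/script.h"
#include "trtorch/trtorch.h"
#include "trtorch/ptq.h"

template <typename DataLoader>
void compile_int8(torch::jit::script::Module& mod,
                  std::unique_ptr<DataLoader> calibration_dataloader) {
  // Named string: Int8Calibrator keeps a reference to the cache path,
  // so a temporary here would dangle.
  const std::string calibration_cache_file = "/tmp/calibration.cache";
  auto calibrator = trtorch::ptq::make_int8_calibrator(
      std::move(calibration_dataloader), calibration_cache_file, /*use_cache=*/true);

  std::vector<std::vector<int64_t>> dims = {{32, 3, 32, 32}};  // placeholder input shape
  auto extra_info = trtorch::ExtraInfo(dims);
  extra_info.op_precision = torch::kI8;    // request INT8 kernels
  extra_info.ptq_calibrator = calibrator;  // TRT pulls batches via getBatch()
  auto trt_mod = trtorch::CompileGraph(mod, extra_info);
  (void)trt_mod;  // use or save the compiled module
}
```

Note that `get_batch_impl` writes the same tensor's device pointer into every binding slot it is asked for, which is fine for the single-input networks the PTQ example targets.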

cpp/benchmark/README.md

Lines changed: 4 additions & 2 deletions

```diff
@@ -1,6 +1,6 @@
 # Benchmarking
 
-This is a quick benchmarking application for TRTorch. It lets you run supported TorchScript modules both in JIT and TRT and returns the average runtime and throughput.
+This is a quick benchmarking application for TRTorch. It lets you run supported TorchScript modules both in JIT and TRT and returns the average runtime and throughput.
 
 ## Compilation / Usage
 
@@ -20,12 +20,14 @@ bazel run //cpp/benchmark --cxxopt="-DNDEBUG" --cxxopt="-DJIT" --cxxopt="-DTRT"
 
 ### Options
 
-You can run a module with JIT or TRT via TRTorch in either FP32 or FP16. These options are controlled by preprocessor directives.
+You can run a module with JIT or TRT via TRTorch in either FP32 or FP16. These options are controlled by preprocessor directives.
 
 - To enable JIT profiling, add the argument `--cxxopt="-DJIT"`
 
 - To enable TRT profiling, add the argument `--cxxopt="-DTRT"`
 
 - To enable FP16 execution, add the argument `--cxxopt="-DHALF"`
 
+- To also save the TRT engine, add the argument `--cxxopt="-DSAVE_ENGINE"`
+
 > It's suggested to also define `--cxxopt="-DNDEBUG"` to supress debug information
```

(The `-`/`+` pairs that render identically are whitespace-only changes in the original file.)
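Combining the new flag with the existing ones, a run that benchmarks TRT and also writes the serialized engine might look like `bazel run //cpp/benchmark --cxxopt="-DNDEBUG" --cxxopt="-DTRT" --cxxopt="-DSAVE_ENGINE" [...]`, with the positional model/shape arguments as documented in the Compilation / Usage section of the README (not shown in this diff).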

cpp/benchmark/main.cpp

Lines changed: 26 additions & 16 deletions

```diff
@@ -105,15 +105,6 @@ int main(int argc, const char* argv[]) {
 
     mod.to(at::kCUDA);
 
-#ifdef HALF
-    mod.to(torch::kHalf);
-    for (auto layer : mod.named_modules()) {
-        if (layer.name.find(".bn") != std::string::npos) {
-            layer.value.to(torch::kFloat);
-        }
-    }
-#endif
-
     std::vector<std::vector<int64_t>> dims;
     for (int i = 2; i < argc; i++) {
         auto arg = std::string(argv[i]);
@@ -129,23 +120,42 @@ int main(int argc, const char* argv[]) {
 
     at::globalContext().setBenchmarkCuDNN(true);
 
-#ifdef JIT
-    auto jit_runtimes = benchmark_module(mod, dims[0]);
-    print_avg_std_dev("JIT", jit_runtimes, dims[0][0]);
-#endif
-
 #ifdef TRT
     auto extra_info = trtorch::ExtraInfo(dims);
-    extra_info.workspace_size = 1 << 24;
+    extra_info.workspace_size = 1 << 20;
 
 #ifdef HALF
-    extra_info.op_precision = at::kHalf;
+    extra_info.op_precision = torch::kF16;
 #endif
 
     auto trt_mod = trtorch::CompileGraph(mod, extra_info);
+
+#ifdef SAVE_ENGINE
+    std::cout << "Compiling graph to save as TRT engine (/tmp/engine_converted_from_jit.trt)" << std::endl;
+    auto engine = trtorch::ConvertGraphToTRTEngine(mod, "forward", extra_info);
+    std::ofstream out("/tmp/engine_converted_from_jit.trt");
+    out << engine;
+    out.close();
+#endif
+
     auto trt_runtimes = benchmark_module(trt_mod, dims[0]);
     print_avg_std_dev("JIT/TRT", trt_runtimes, dims[0][0]);
 #endif
 
+
+#ifdef HALF
+    mod.to(torch::kHalf);
+    for (auto layer : mod.named_modules()) {
+        if (layer.name.find(".bn") != std::string::npos) {
+            layer.value.to(torch::kFloat);
+        }
+    }
+#endif
+
+#ifdef JIT
+    auto jit_runtimes = benchmark_module(mod, dims[0]);
+    print_avg_std_dev("JIT", jit_runtimes, dims[0][0]);
+#endif
+
     std::cout << "ok\n";
 }
```

Note the reordering: the module is now compiled with TRT (in FP32) before the `-DHALF` conversion is applied to the TorchScript module, so the half-precision cast no longer interferes with TRT compilation; the JIT benchmark moves after it accordingly.
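The file written under `-DSAVE_ENGINE` is a serialized TensorRT engine, so it can be reloaded outside of TRTorch with the TensorRT runtime API. A hedged sketch, not part of this commit, assuming a TensorRT 6/7-era API where `deserializeCudaEngine` still takes a plugin-factory argument:

```cpp
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
#include "NvInfer.h"

// Minimal logger required by the TensorRT runtime.
class SimpleLogger : public nvinfer1::ILogger {
  void log(Severity severity, const char* msg) override {
    if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
  }
};

nvinfer1::ICudaEngine* load_engine(const std::string& path) {
  // Read the serialized engine written by the benchmark binary.
  std::ifstream file(path, std::ios::binary);
  std::vector<char> blob((std::istreambuf_iterator<char>(file)),
                         std::istreambuf_iterator<char>());
  static SimpleLogger logger;
  auto runtime = nvinfer1::createInferRuntime(logger);
  // TensorRT releases of this era take an IPluginFactory* third argument.
  return runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);
}
```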

cpp/ptq/BUILD

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,9 +4,9 @@ cc_binary(
     name = "ptq",
     srcs = [
         "main.cpp",
-        "timer.h"
     ],
     deps = [
+        "//cpp/ptq/benchmark",
        "//cpp/ptq/datasets:cifar10",
        "@libtorch//:libtorch",
        "@libtorch//:caffe2",
```

cpp/ptq/benchmark/BUILD

Lines changed: 17 additions & 0 deletions

```diff
@@ -0,0 +1,17 @@
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "benchmark",
+    hdrs = [
+        "benchmark.h"
+    ],
+    srcs = [
+        "benchmark.cpp",
+        "timer.h"
+    ],
+    deps = [
+        "@libtorch//:libtorch",
+        "@libtorch//:caffe2",
+        "//cpp/api:trtorch"
+    ],
+)
```

cpp/ptq/benchmark/benchmark.cpp

Lines changed: 70 additions & 0 deletions

```diff
@@ -0,0 +1,70 @@
+#include "torch/script.h"
+#include "torch/torch.h"
+#include "ATen/Context.h"
+#include "c10/cuda/CUDACachingAllocator.h"
+#include "trtorch/trtorch.h"
+#include "cuda_runtime_api.h"
+
+#include "timer.h"
+
+#define NUM_WARMUP_RUNS 20
+#define NUM_RUNS 100
+
+// Benchmarking code
+void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t batch_size) {
+  float avg_runtime = std::accumulate(runtimes.begin(), runtimes.end(), 0.0) / runtimes.size();
+  float fps = (1000.f / avg_runtime) * batch_size;
+  std::cout << "[" << type << "]: batch_size: " << batch_size << "\n Average latency: " << avg_runtime << " ms\n Average FPS: " << fps << " fps" << std::endl;
+
+  std::vector<float> rt_diff(runtimes.size());
+  std::transform(runtimes.begin(), runtimes.end(), rt_diff.begin(), [avg_runtime](float x) { return x - avg_runtime; });
+  float rt_sq_sum = std::inner_product(rt_diff.begin(), rt_diff.end(), rt_diff.begin(), 0.0);
+  float rt_std_dev = std::sqrt(rt_sq_sum / runtimes.size());
+
+  std::vector<float> fps_diff(runtimes.size());
+  std::transform(runtimes.begin(), runtimes.end(), fps_diff.begin(), [fps, batch_size](float x) { return ((1000.f / x) * batch_size) - fps; });
+  float fps_sq_sum = std::inner_product(fps_diff.begin(), fps_diff.end(), fps_diff.begin(), 0.0);
+  float fps_std_dev = std::sqrt(fps_sq_sum / runtimes.size());
+  std::cout << " Latency Standard Deviation: " << rt_std_dev << "\n FPS Standard Deviation: " << fps_std_dev << "\n(excluding initial warmup runs)" << std::endl;
+}
+
+std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape) {
+  auto execution_timer = timers::PreciseCPUTimer();
+  std::vector<float> execution_runtimes;
+
+  for (uint64_t i = 0; i < NUM_WARMUP_RUNS; i++) {
+    std::vector<torch::jit::IValue> inputs_ivalues;
+    auto in = at::rand(shape, {at::kCUDA});
+#ifdef HALF
+    in = in.to(torch::kHalf);
+#endif
+    inputs_ivalues.push_back(in.clone());
+
+    cudaDeviceSynchronize();
+    mod.forward(inputs_ivalues);
+    cudaDeviceSynchronize();
+
+  }
+
+  for (uint64_t i = 0; i < NUM_RUNS; i++) {
+    std::vector<torch::jit::IValue> inputs_ivalues;
+    auto in = at::rand(shape, {at::kCUDA});
+#ifdef HALF
+    in = in.to(torch::kHalf);
+#endif
+    inputs_ivalues.push_back(in.clone());
+    cudaDeviceSynchronize();
+
+    execution_timer.start();
+    mod.forward(inputs_ivalues);
+    cudaDeviceSynchronize();
+    execution_timer.stop();
+
+    auto time = execution_timer.milliseconds();
+    execution_timer.reset();
+    execution_runtimes.push_back(time);
+
+    c10::cuda::CUDACachingAllocator::emptyCache();
+  }
+  return execution_runtimes;
+}
```
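A sketch of how these two helpers fit together in a driver (hypothetical; the model path and input shape are placeholders, and in the real binaries these come from `argv`):

```cpp
#include "torch/script.h"  // must precede benchmark.h, which has no includes of its own
#include "benchmark.h"

int main() {
  // Placeholder path; the real ptq/benchmark binaries take this from argv.
  torch::jit::script::Module mod = torch::jit::load("/tmp/model.ts");
  mod.to(at::kCUDA);
  mod.eval();

  std::vector<int64_t> shape = {32, 3, 32, 32};  // e.g. a CIFAR10-sized batch
  auto runtimes = benchmark_module(mod, shape);  // 20 warmup runs + 100 timed runs
  print_avg_std_dev("JIT", runtimes, shape[0]);  // mean/std-dev of latency and FPS
  return 0;
}
```

Each timed iteration brackets `mod.forward` with `cudaDeviceSynchronize()` so the CPU timer measures the full device-side execution, and the CUDA caching allocator is emptied between runs to keep allocations from skewing successive measurements.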

cpp/ptq/benchmark/benchmark.h

Lines changed: 4 additions & 0 deletions

```diff
@@ -0,0 +1,4 @@
+#pragma once
+
+void print_avg_std_dev(std::string type, std::vector<float>& runtimes, uint64_t batch_size);
+std::vector<float> benchmark_module(torch::jit::script::Module& mod, std::vector<int64_t> shape);
```
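Note that this header names `std::string`, `std::vector`, and `torch::jit::script::Module` without including their headers, so any translation unit that includes it must pull in `torch/script.h` (and with it the standard headers) first.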
File renamed without changes.
