pytorch · q10 · Jul 5, 2025
diff --git a/.clang-tidy b/.clang-tidy
@@ -7,6 +7,7 @@ InheritParentConfig: true
 # @nolint
 Checks: '
 -*,
+cppcoreguidelines-init-variables,
 bugprone-argument-comment,
 misc-use-internal-linkage,
 modernize*,

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -208,8 +208,11 @@ else(MSVC)
   string(APPEND CMAKE_CXX_FLAGS " -Wunknown-pragmas")
   string(APPEND CMAKE_CXX_FLAGS " -Wimplicit-fallthrough")
   string(APPEND CMAKE_CXX_FLAGS " -Wno-strict-aliasing")
+  string(APPEND CMAKE_CXX_FLAGS " -Wunused-variable")
   if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 17.0.0)
     string(APPEND CMAKE_CXX_FLAGS " -Wno-vla-cxx-extension")
+  elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    string(APPEND CMAKE_CXX_FLAGS " -Wmaybe-uninitialized")
   endif()
   target_compile_options(fbgemm_avx2 PRIVATE
     "-m64" "-mavx2" "-mf16c" "-mfma")

diff --git a/bench/AlignedVec.h b/bench/AlignedVec.h
@@ -107,10 +107,9 @@ class aligned_allocator {
 
     // Mallocator wraps malloc().
     void* pv = nullptr;
-    int ret;
+    int ret = 0;
 #ifdef _MSC_VER
     pv = _aligned_malloc(n * sizeof(T), Alignment);
-    ret = 0;
 #else
     ret = posix_memalign(&pv, Alignment, n * sizeof(T));
 #endif

diff --git a/bench/BenchUtils.cc b/bench/BenchUtils.cc
@@ -150,9 +150,8 @@ aligned_vector<float> getRandomSparseVector(
   std::sort(sorted_res.begin(), sorted_res.end());
   int32_t numZeros =
       size - static_cast<int32_t>(std::round(size * fractionNonZeros));
-  float thr;
   if (numZeros) {
-    thr = sorted_res[numZeros - 1];
+    float thr = sorted_res[numZeros - 1];
 
     for (auto& f : res) {
       if (f <= thr) {

diff --git a/bench/BenchUtils.h b/bench/BenchUtils.h
@@ -28,6 +28,7 @@
 
 #ifdef _OPENMP
 #include <omp.h>
 #endif
+#include <cmath>
 
 #ifdef USE_MKL
@@ -136,8 +137,6 @@ double measureWithWarmup(
   {
 #endif
     for (int i = 0; i < measuredIterations; ++i) {
-      std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
-
       const auto thread_id = useOpenMP ? fbgemm_get_thread_num() : 0;
 
       if (thread_id == 0) {
@@ -149,7 +148,7 @@ double measureWithWarmup(
 #pragma omp barrier
       }
 #endif
-      start = std::chrono::high_resolution_clock::now();
+      auto start = std::chrono::high_resolution_clock::now();
 
       fn();
 
@@ -159,7 +158,7 @@ double measureWithWarmup(
       }
 #endif
 
-      end = std::chrono::high_resolution_clock::now();
+      auto end = std::chrono::high_resolution_clock::now();
       auto dur =
           std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
 
@@ -256,7 +255,6 @@ void performance_test(
 #endif
 
   std::string type;
-  double gflops, gbs, ttot;
   for (auto s : shapes) {
     int m = s[0];
     int n = s[1];
@@ -266,6 +264,7 @@ void performance_test(
     aligned_vector<int> Aint(m * k);
     randFill(Aint, 0, 4);
     std::vector<aligned_vector<float>> A;
+    A.reserve(num_instances);
     for (int i = 0; i < num_instances; ++i) {
       A.emplace_back(Aint.begin(), Aint.end());
     }
@@ -321,6 +320,7 @@ void performance_test(
 
     double nflops = 2.0 * m * n * k;
     double nbytes = 4.0 * m * k + sizeof(btype) * 1.0 * k * n + 4.0 * m * n;
+    double gflops = 0, gbs = 0, ttot = 0.0;
 
     // warm up MKL and fbgemm
     // check correctness at the same time

diff --git a/bench/EmbeddingIndexRemappingBenchmark.cc b/bench/EmbeddingIndexRemappingBenchmark.cc
@@ -135,9 +135,9 @@ static int run_benchmark(
 }
 
 int main() {
-  int batch_size;
-  int num_rows;
-  int average_len;
+  int batch_size = 0;
+  int num_rows = 0;
+  int average_len = 0;
 
   vector<vector<int>> inputs(GetInputs_());
 

diff --git a/bench/EmbeddingSpMDMBenchmark.cc b/bench/EmbeddingSpMDMBenchmark.cc
@@ -12,7 +12,6 @@
 #endif
 #include <algorithm>
 #include <cassert>
-#include <chrono>
 #include <cmath>
 #include <cstdint>
 #include <iomanip>

diff --git a/bench/EmbeddingSpMDMNBit2Benchmark.cc b/bench/EmbeddingSpMDMNBit2Benchmark.cc
@@ -12,7 +12,6 @@
 #endif
 #include <algorithm>
 #include <cassert>
-#include <chrono>
 #include <cmath>
 #include <cstdint>
 #include <iomanip>

diff --git a/bench/EmbeddingSpMDMNBitBenchmark.cc b/bench/EmbeddingSpMDMNBitBenchmark.cc
@@ -492,10 +492,10 @@ static int run_benchmark(
 }
 
 int main() {
-  int batch_size;
-  int num_rows;
-  int embedding_dim;
-  int average_len;
+  int batch_size = 0;
+  int num_rows = 0;
+  int embedding_dim = 0;
+  int average_len = 0;
 
   vector<vector<int>> inputs(GetInputs_());
 

diff --git a/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc b/bench/EmbeddingSpMDMNBitRowWiseSparseBenchmark.cc
@@ -325,10 +325,10 @@ static int run_benchmark(
 }
 
 int main() {
-  int batch_size;
-  int num_rows;
-  int embedding_dim;
-  int average_len;
+  int batch_size = 0;
+  int num_rows = 0;
+  int embedding_dim = 0;
+  int average_len = 0;
 
   vector<vector<int>> inputs(GetInputs_());
 

diff --git a/bench/RowwiseAdagradBenchmark.cc b/bench/RowwiseAdagradBenchmark.cc
@@ -192,9 +192,9 @@ static void run_benchmark(
 }
 
 int main() {
-  int num_rows;
-  int block_size;
-  uint64_t param_size;
+  int num_rows = 0;
+  int block_size = 0;
+  uint64_t param_size = 0;
   vector<vector<int>> inputs(GetInputs_());
 
   for (auto isIndex64b : vector<bool>{true, false}) {

diff --git a/bench/SparseAdagradBenchmark.cc b/bench/SparseAdagradBenchmark.cc
@@ -199,9 +199,9 @@ static void run_benchmark(
 }
 
 int main() {
-  int num_rows;
-  int block_size;
-  uint64_t param_size;
+  int num_rows = 0;
+  int block_size = 0;
+  uint64_t param_size = 0;
   vector<vector<int>> inputs(GetInputs_());
 
   for (auto isIndex64b : vector<bool>{true, false}) {

diff --git a/include/fbgemm/ConvUtils.h b/include/fbgemm/ConvUtils.h
@@ -16,30 +16,26 @@
 namespace fbgemm {
 
 template <int N, int... Vals>
-constexpr
-    typename std::enable_if<N == sizeof...(Vals), std::array<int, N>>::type
-    array_of_ones() {
+constexpr std::enable_if_t<N == sizeof...(Vals), std::array<int, N>>
+array_of_ones() {
   return std::array<int, N>{{Vals...}};
 }
 
 template <int N, int... Vals>
-constexpr
-    typename std::enable_if<N != sizeof...(Vals), std::array<int, N>>::type
-    array_of_ones() {
+constexpr std::enable_if_t<N != sizeof...(Vals), std::array<int, N>>
+array_of_ones() {
   return array_of_ones<N, Vals..., 1>();
 }
 
 template <int N, int... Vals>
-constexpr
-    typename std::enable_if<N == sizeof...(Vals), std::array<int, N>>::type
-    array_of_zeroes() {
+constexpr std::enable_if_t<N == sizeof...(Vals), std::array<int, N>>
+array_of_zeroes() {
   return std::array<int, N>{{Vals...}};
 }
 
 template <int N, int... Vals>
-constexpr
-    typename std::enable_if<N != sizeof...(Vals), std::array<int, N>>::type
-    array_of_zeroes() {
+constexpr std::enable_if_t<N != sizeof...(Vals), std::array<int, N>>
+array_of_zeroes() {
   return array_of_zeroes<N, Vals..., 0>();
 }
 

diff --git a/include/fbgemm/FloatConversion.h b/include/fbgemm/FloatConversion.h
@@ -8,6 +8,8 @@
 
 #pragma once
 
+#include <cmath>
+
 #include <cassert>
 #include <climits>
 #include <cstdint>
@@ -211,7 +213,7 @@ template <typename Src, typename Tgt, RoundingMode RoundingMode>
 } // namespace detail
 
 inline float16 cpu_float2half_rn(float f) {
-  uint32_t f_u32;
+  uint32_t f_u32 = 0;
   std::memcpy(&f_u32, &f, sizeof(f_u32));
   return detail::ieee754_trunc<
       /*Src=*/detail::IEEE754Single,
@@ -220,7 +222,7 @@ inline float16 cpu_float2half_rn(float f) {
 }
 
 inline float16 cpu_float2half_rz(float f) {
-  uint32_t f_u32;
+  uint32_t f_u32 = 0;
   std::memcpy(&f_u32, &f, sizeof(f_u32));
   return detail::ieee754_trunc<
       /*Src=*/detail::IEEE754Single,
@@ -263,7 +265,7 @@ inline float cpu_half2float_ref(const float16 h) {
     exponent = f32_exponent_mask;
   } else if (!exponent) { // Denorm or Zero
     if (mantissa) {
-      uint32_t msb;
+      uint32_t msb = 0;
       exponent = f32_exponent_bias - f16_exponent_bias + 1;
       do {
         msb = mantissa & f32_most_significant_bit;
@@ -279,7 +281,7 @@ inline float cpu_half2float_ref(const float16 h) {
   const uint32_t i = (sign_bit << f32_num_non_sign_bits) |
       (exponent << f32_num_mantissa_bits) | mantissa;
 
-  float ret;
+  float ret = NAN;
   std::memcpy(&ret, &i, sizeof(float));
   return ret;
 }
@@ -288,7 +290,7 @@ inline float cpu_half2float_ref(const float16 h) {
 // conversion provided by the compiler
 inline float cpu_half2float(const float16 h) {
 #if defined(HAS_NATIVE_FP16_TYPE) && not defined(MISSING_GNU_F2H_IEEE)
-  __fp16 h_fp16;
+  __fp16 h_fp16 = NAN;
   std::memcpy(&h_fp16, &h, sizeof(__fp16));
   return h_fp16;
 #else
@@ -299,7 +301,7 @@ inline float cpu_half2float(const float16 h) {
 inline float16 cpu_float2half(const float f) {
 #if defined(HAS_NATIVE_FP16_TYPE) && not defined(MISSING_GNU_F2H_IEEE)
   __fp16 h = f;
-  float16 res;
+  float16 res = 0;
   std::memcpy(&res, &h, sizeof(__fp16));
   return res;
 #else
@@ -308,15 +310,15 @@ inline float16 cpu_float2half(const float f) {
 }
 
 inline float cpu_bf162float(bfloat16 src) {
-  float ret;
+  float ret = NAN;
   uint32_t val_fp32 =
       static_cast<uint32_t>(reinterpret_cast<const uint16_t*>(&src)[0]) << 16;
   std::memcpy(&ret, &val_fp32, sizeof(float));
   return ret;
 }
 
 inline bfloat16 cpu_float2bfloat16(float src) {
-  uint32_t temp;
+  uint32_t temp = 0;
   std::memcpy(&temp, &src, sizeof(uint32_t));
   return (temp + (1u << 15)) >> 16;
 }

diff --git a/include/fbgemm/QuantUtils.h b/include/fbgemm/QuantUtils.h
@@ -68,7 +68,7 @@ T Quantize(
     std::int32_t zero_point,
     float scale,
     int result_precision,
-    bool result_is_signed = std::is_signed<T>::value) {
+    bool result_is_signed = std::is_signed_v<T>) {
   // Note: We want to multiply with src with inv_scale instead of
   // dividing src by scale. The same is done in vector code and
   // at other places.
@@ -162,7 +162,7 @@ void Dequantize(
     const TensorQuantizationParams& qparams,
     int thread_id = 0,
     int num_threads = 1) {
-  int64_t i_begin, i_end;
+  int64_t i_begin = 0, i_end = 0;
   fbgemmPartition1D(thread_id, num_threads, len, i_begin, i_end);
   for (int64_t i = i_begin; i < i_end; i++) {
     dst[i] = Dequantize(src[i], qparams);

diff --git a/include/fbgemm/QuantUtilsAvx2.h b/include/fbgemm/QuantUtilsAvx2.h
@@ -71,7 +71,7 @@ void FusedQuantizeDequantizeAvx2(
 ///
 /// Random number generator in [0, 9] based on
 /// <a href="https://www.jstatsoft.org/v08/i14/paper">this paper</a>.
-uint32_t FBGEMM_API Xor128(void);
+uint32_t FBGEMM_API Xor128();
 
 /// @ingroup fbgemm-quant-utils-avx2
 ///

diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h
@@ -35,7 +35,7 @@ namespace fbgemm {
 template <typename T>
 struct is_8bit {
   static constexpr bool value =
-      std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
+      std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>;
 };
 
 /**
@@ -263,8 +263,8 @@ std::string arrayToString(const std::array<T, SIZE>& inp) {
 
 template <typename accT = std::int32_t>
 bool isValidBlockingFactor(const BlockingFactors* const param) {
-  constexpr bool is_32bit = std::is_same<accT, int32_t>::value;
-  constexpr bool is_16bit = std::is_same<accT, int16_t>::value;
+  constexpr bool is_32bit = std::is_same_v<accT, int32_t>;
+  constexpr bool is_16bit = std::is_same_v<accT, int16_t>;
   static const auto iset = fbgemmInstructionSet();
 
   if constexpr (is_32bit) {
@@ -447,7 +447,7 @@ void nbit_embedding_sanity_check(
   assert(
       (input_bit_rate == 2 || input_bit_rate == 4) &&
       "input_bit_rate must be 2 or 4");
-  if constexpr (std::is_same<OutType, uint8_t>::value) {
+  if constexpr (std::is_same_v<OutType, uint8_t>) {
     assert(
         (no_bag && input_bit_rate == 4 && output_bit_rate == 4) &&
         "we currently only support int4 to int4 for sequential TBE");

diff --git a/src/CodeGenHelpers.h b/src/CodeGenHelpers.h
@@ -91,8 +91,8 @@ template <
         int> = 0>
 void emitExtractHalfVector(
     x86::Emitter* a,
-    x86::Ymm half,
-    const x86::Zmm vec,
+    const x86::Ymm& half,
+    const x86::Zmm& vec,
     int idx) {
   a->vextracti32x8(half, vec, idx);
 }
@@ -107,8 +107,8 @@ template <
         int> = 0>
 void emitExtractHalfVector(
     x86::Emitter* a,
-    x86::Xmm half,
-    x86::Ymm vec,
+    const x86::Xmm& half,
+    const x86::Ymm& vec,
     int idx) {
   a->vextracti32x4(half, vec, idx);
 }
@@ -119,8 +119,8 @@ template <
     std::enable_if_t<instSet == inst_set_t::avx2, int> = 0>
 void emitExtractHalfVector(
     x86::Emitter* a,
-    x86::Xmm half,
-    x86::Ymm vec,
+    const x86::Xmm& half,
+    const x86::Ymm& vec,
     int idx) {
   a->vextracti128(half, vec, idx);
 }