microsoft
diff --git a/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block_bnb4.h b/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block_bnb4.h
Lines changed: 68 additions & 47 deletions
diff --git a/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h b/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h
Lines changed: 6 additions & 2 deletions
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_bnb4.cc
Lines changed: 14 additions & 13 deletions
diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cu b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cu
Lines changed: 58 additions & 37 deletions
diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cuh b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cuh
Lines changed: 1 addition & 1 deletion
@@ -44,67 +44,79 @@ FORCEINLINE uint8_t QuantizeOneFP4(float x) {
 
   int sign = x < 0 ? 0b1000 : 0b0000;
   x = fabsf(x);
-  if (x > 0.29166667f)
-    if (x > 0.583333f)
-      if (x > 0.8333333f)
+  if (x > 0.29166667f) {
+    if (x > 0.583333f) {
+      if (x > 0.8333333f) {
         return 0b0011 + sign;
-      else
+      } else {
         return 0b0010 + sign;
-    else if (x > 0.4166667f)
+      }
+    } else if (x > 0.4166667f) {
       return 0b101 + sign;
-    else
+    } else {
       return 0b100 + sign;
-  else if (x > 0.0859375f)
-    if (x > 0.20833333f)
+    }
+  } else if (x > 0.0859375f) {
+    if (x > 0.20833333f) {
       return 0b0111 + sign;
-    else
+    } else {
       return 0b0110 + sign;
-  else if (x > 0.00260417f)
+    }
+  } else if (x > 0.00260417f) {
     return 0b0001 + sign;
-  else
+  } else {
     return 0b0000 + sign;
+  }
 }
 
 FORCEINLINE uint8_t QuantizeOneNF4(float x) {
-  if (x > 0.03979014977812767f)
-    if (x > 0.3893125355243683f)      // 1
-      if (x > 0.6427869200706482f)    // 11
-        if (x > 0.8614784181118011f)  // 111
+  if (x > 0.03979014977812767f) {
+    if (x > 0.3893125355243683f) {      // 1
+      if (x > 0.6427869200706482f) {    // 11
+        if (x > 0.8614784181118011f) {  // 111
           return 0b1111;
-        else
+        } else {
           return 0b1110;
-      else if (x > 0.5016634166240692f)  // 110
+        }
+      } else if (x > 0.5016634166240692f) {  // 110
         return 0b1101;
-      else
+      } else {
         return 0b1100;
-    else if (x > 0.2035212516784668f)  // 10
-      if (x > 0.2920137718319893f)     // 101
+      }
+    } else if (x > 0.2035212516784668f) {  // 10
+      if (x > 0.2920137718319893f) {       // 101
         return 0b1011;
-      else
+      } else {
         return 0b1010;
-    else if (x > 0.1202552504837513f)  // 100
+      }
+    } else if (x > 0.1202552504837513f) {  // 100
       return 0b1001;
-    else
+    } else {
       return 0b1000;
-  else if (x > -0.33967943489551544f)  // 0
-    if (x > -0.13791173323988914f)     // 01
-      if (x > -0.045525018125772476f)  // 011
+    }
+  } else if (x > -0.33967943489551544f) {  // 0
+    if (x > -0.13791173323988914f) {       // 01
+      if (x > -0.045525018125772476f) {    // 011
         return 0b0111;
-      else
+      } else {
         return 0b0110;
-    else if (x > -0.23460740596055984f)  // 010
+      }
+    } else if (x > -0.23460740596055984f) {  // 010
       return 0b0101;
-    else
+    } else {
       return 0b0100;
-  else if (x > -0.6106329262256622f)  // 00
-    if (x > -0.4599952697753906f)     // 001
+    }
+  } else if (x > -0.6106329262256622f) {  // 00
+    if (x > -0.4599952697753906f) {       // 001
       return 0b0011;
-    else
+    } else {
       return 0b0010;
-  else if (x > -0.8480964004993439f)  // 000
+    }
+  } else if (x > -0.8480964004993439f) {  // 000
     return 0b0001;
-  else
+  } else {
     return 0b0000;
+  }
 }
 
 template <int32_t DATA_TYPE>
@@ -142,17 +154,27 @@ FORCEINLINE void QuantizeBlockBnb4(const T* src, uint8_t* dst, T& absmax_block,
   }
 }
 
-static float fp4_qaunt_map[16] = {
-    0.00000000f, 5.208333333e-03f, 0.66666667f, 1.00000000f,
-    0.33333333f, 0.50000000f, 0.16666667f, 0.25000000f,
-    -0.00000000f, -5.208333333e-03f, -0.66666667f, -1.00000000f,
-    -0.33333333f, -0.50000000f, -0.16666667f, -0.25000000f};
-
-static float nf4_qaunt_map[16] = {
-    -1.0f, -0.6961928009986877f, -0.5250730514526367f, -0.39491748809814453f,
-    -0.28444138169288635f, -0.18477343022823334f, -0.09105003625154495f, 0.0f,
-    0.07958029955625534f, 0.16093020141124725f, 0.24611230194568634f, 0.33791524171829224f,
-    0.44070982933044434f, 0.5626170039176941f, 0.7229568362236023f, 1.0f};
+static float fp4_qaunt_map[16] = {0.00000000f,  5.208333333e-03f,  0.66666667f,  1.00000000f,
+                                  0.33333333f,  0.50000000f,       0.16666667f,  0.25000000f,
+                                  -0.00000000f, -5.208333333e-03f, -0.66666667f, -1.00000000f,
+                                  -0.33333333f, -0.50000000f,      -0.16666667f, -0.25000000f};
+
+static float nf4_qaunt_map[16] = {-1.0f,
+                                  -0.6961928009986877f,
+                                  -0.5250730514526367f,
+                                  -0.39491748809814453f,
+                                  -0.28444138169288635f,
+                                  -0.18477343022823334f,
+                                  -0.09105003625154495f,
+                                  0.0f,
+                                  0.07958029955625534f,
+                                  0.16093020141124725f,
+                                  0.24611230194568634f,
+                                  0.33791524171829224f,
+                                  0.44070982933044434f,
+                                  0.5626170039176941f,
+                                  0.7229568362236023f,
+                                  1.0f};
 
 template <typename T, int32_t DATA_TYPE>
 FORCEINLINE T DequantizeOneBnb4(uint8_t x) {
@@ -172,8 +194,7 @@ FORCEINLINE void DequantizeBlockBnb4(const uint8_t* src, T* dst, T absmax_block,
     const uint8_t val = src[src_offset + idx / 2];
 
     dst[dst_offset + idx] = DequantizeOneBnb4<T, DATA_TYPE>(val >> 4) * absmax_block;
-    if (idx + 1 < block_len)
-      dst[dst_offset + idx + 1] = DequantizeOneBnb4<T, DATA_TYPE>(val & 0xF) * absmax_block;
+    if (idx + 1 < block_len) dst[dst_offset + idx + 1] = DequantizeOneBnb4<T, DATA_TYPE>(val & 0xF) * absmax_block;
   }
 }
 
 
@@ -51,7 +51,9 @@ void QuantizeBlockwiseBnb4(
     int32_t N,
     int32_t K,
     onnxruntime::concurrency::ThreadPool* thread_pool) {
-  ORT_ENFORCE(quant_type == FP4 || quant_type == NF4, "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
+  ORT_ENFORCE(
+      quant_type == FP4 || quant_type == NF4,
+      "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
 
   if (block_size == 16) {
     QuantizeBlockwiseBn4DataTyped(16, quant_type);
@@ -106,7 +108,9 @@ void DequantizeBlockwiseBnb4(
     int32_t N,
     int32_t K,
     onnxruntime::concurrency::ThreadPool* thread_pool) {
-  ORT_ENFORCE(quant_type == FP4 || quant_type == NF4, "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
+  ORT_ENFORCE(
+      quant_type == FP4 || quant_type == NF4,
+      "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
 
   if (block_size == 16) {
     DequantizeBlockwiseBn4DataTyped(16, quant_type);
 
@@ -18,7 +18,9 @@ class MatMulBnb4 final : public OpKernel {
     ORT_ENFORCE(Status::OK() == info.GetAttr<int64_t>("N", &N_));
     ORT_ENFORCE(Status::OK() == info.GetAttr<int64_t>("block_size", &block_size_));
     ORT_ENFORCE(Status::OK() == info.GetAttr<int64_t>("quant_type", &quant_type_));
-    ORT_ENFORCE(quant_type_ == FP4 || quant_type_ == NF4, "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
+    ORT_ENFORCE(
+        quant_type_ == FP4 || quant_type_ == NF4,
+        "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
   }
 
   Status Compute(OpKernelContext* context) const override;
@@ -45,14 +47,15 @@ Status MatMulBnb4::Compute(OpKernelContext* ctx) const {
   auto status = ctx->GetTempSpaceAllocator(&allocator);
   ORT_RETURN_IF_ERROR(status);
   auto tmp_b_data_ptr = IAllocator::MakeUniquePtr<float>(allocator, SafeInt<size_t>(K_) * N_);
-  DequantizeBlockwiseBnb4<float>(tmp_b_data_ptr.get(),
-                                 b_quant_data,
-                                 absmax_data,
-                                 static_cast<int32_t>(block_size_),
-                                 static_cast<int32_t>(quant_type_),
-                                 static_cast<int32_t>(N_),
-                                 static_cast<int32_t>(K_),
-                                 thread_pool);
+  DequantizeBlockwiseBnb4<float>(
+      tmp_b_data_ptr.get(),
+      b_quant_data,
+      absmax_data,
+      static_cast<int32_t>(block_size_),
+      static_cast<int32_t>(quant_type_),
+      static_cast<int32_t>(N_),
+      static_cast<int32_t>(K_),
+      thread_pool);
 
   constexpr bool transa = false;
   constexpr bool transb = true;
@@ -63,8 +66,7 @@ Status MatMulBnb4::Compute(OpKernelContext* ctx) const {
   Tensor* y = ctx->Output(0, helper.OutputShape());
 
   // Bail out early if the output is going to be empty
-  if (y->Shape().Size() == 0)
-    return Status::OK();
+  if (y->Shape().Size() == 0) return Status::OK();
 
   auto* y_data = y->MutableData<float>();
 
@@ -88,8 +90,7 @@ Status MatMulBnb4::Compute(OpKernelContext* ctx) const {
     data[i].alpha = 1.f;
     data[i].beta = 0.0f;
   }
-  MlasGemmBatch(CblasNoTrans, CblasTrans,
-                M, N, K, data.data(), max_len, thread_pool);
+  MlasGemmBatch(CblasNoTrans, CblasTrans, M, N, K, data.data(), max_len, thread_pool);
 
   return Status::OK();
 }
 
@@ -11,23 +11,22 @@ namespace onnxruntime {
 namespace contrib {
 namespace cuda {
 
-template<class T>
-Status SetBnbQuantMap(int quant_type, T* quant_map_buffer, cudaStream_t stream)
-{
-  ORT_ENFORCE(quant_type == FP4 || quant_type == NF4, "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
-  
+template <class T>
+Status SetBnbQuantMap(int quant_type, T* quant_map_buffer, cudaStream_t stream) {
+  ORT_ENFORCE(
+      quant_type == FP4 || quant_type == NF4,
+      "Invalid quant_type, only 0 (FP4) and 1 (NF4) are supported.");
+
   T host_quant_map[16];
   switch (quant_type) {
     case FP4:
-      for(int i = 0; i < 16; i++)
-        host_quant_map[i] = static_cast<T>(fp4_qaunt_map[i]);
+      for (int i = 0; i < 16; i++) host_quant_map[i] = static_cast<T>(fp4_qaunt_map[i]);
       break;
     case NF4:
-      for(int i = 0; i < 16; i++)
-        host_quant_map[i] = static_cast<T>(nf4_qaunt_map[i]);
+      for (int i = 0; i < 16; i++) host_quant_map[i] = static_cast<T>(nf4_qaunt_map[i]);
       break;
   }
-  CUDA_CALL_THROW(cudaMemcpyAsync(quant_map_buffer, host_quant_map, sizeof(T)*16, cudaMemcpyHostToDevice, stream));
+  CUDA_CALL_THROW(cudaMemcpyAsync(quant_map_buffer, host_quant_map, sizeof(T) * 16, cudaMemcpyHostToDevice, stream));
 
   return Status::OK();
 }
@@ -36,60 +35,82 @@ template Status SetBnbQuantMap<float>(int quant_type, float* quant_map_buffer, c
 
 template Status SetBnbQuantMap<half>(int quant_type, half* quant_map_buffer, cudaStream_t stream);
 
-
-template<typename T, int TILE_SIZE, int THREADS, int NUM_PER_TH>
-__global__ void kDequantizeBlockwise(const T *quant_map, T *output, const unsigned char *quant_data, const T *absmax, const int block_size, const int n)
-{
+template <typename T, int TILE_SIZE, int THREADS, int NUM_PER_TH>
+__global__ void kDequantizeBlockwise(
+    const T* quant_map,
+    T* output,
+    const uint8_t* quant_data,
+    const T* absmax,
+    const int block_size,
+    const int n) {
   const int n_load = (gridDim.x * TILE_SIZE);
   int valid_items_load = 0;
   int valid_items_store = 0;
   const int base_idx = (blockIdx.x * TILE_SIZE);
 
-  T vals[NUM_PER_TH*2];
-  unsigned char qvals[NUM_PER_TH];
+  T vals[NUM_PER_TH * 2];
+  uint8_t qvals[NUM_PER_TH];
   T local_abs_max = T(0.0f);
 
-  typedef cub::BlockLoad<unsigned char, THREADS, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
-  typedef cub::BlockStore<T, THREADS, NUM_PER_TH*2, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
+  typedef cub::BlockLoad<uint8_t, THREADS, NUM_PER_TH, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadChar;
+  typedef cub::BlockStore<T, THREADS, NUM_PER_TH * 2, cub::BLOCK_STORE_WARP_TRANSPOSE> StoreT;
 
   __shared__ typename LoadChar::TempStorage loadchar;
   __shared__ typename StoreT::TempStorage storet;
 
-  for (unsigned int i = base_idx; i < n_load; i += gridDim.x*TILE_SIZE)
-  {
-    valid_items_load = (n+1)/2 - i > TILE_SIZE ? TILE_SIZE : (n+1)/2 - i;
-    valid_items_store = n - i*2 > TILE_SIZE*2 ? TILE_SIZE*2 : n - i*2;
+  for (unsigned int i = base_idx; i < n_load; i += gridDim.x * TILE_SIZE) {
+    valid_items_load = (n + 1) / 2 - i > TILE_SIZE ? TILE_SIZE : (n + 1) / 2 - i;
+    valid_items_store = n - i * 2 > TILE_SIZE * 2 ? TILE_SIZE * 2 : n - i * 2;
 
-    local_abs_max = __ldg(&absmax[(i+threadIdx.x*NUM_PER_TH)/(block_size)]);
+    local_abs_max = __ldg(&absmax[(i + threadIdx.x * NUM_PER_TH) / (block_size)]);
 
     __syncthreads();
     LoadChar(loadchar).Load(&(quant_data[i]), qvals, valid_items_load, 128);
 
     #pragma unroll NUM_PER_TH
-    for(int j = 0; j < NUM_PER_TH; j++)
-    {
-      vals[j*2] = quant_map[qvals[j] >> 4] * local_abs_max;
-      vals[j*2 + 1] = quant_map[qvals[j] & 0x0F] * local_abs_max;
+    for (int j = 0; j < NUM_PER_TH; j++) {
+      vals[j * 2] = quant_map[qvals[j] >> 4] * local_abs_max;
+      vals[j * 2 + 1] = quant_map[qvals[j] & 0x0F] * local_abs_max;
     }
 
     __syncthreads();
-    StoreT(storet).Store(&(output[i*2]), vals, valid_items_store);
+    StoreT(storet).Store(&(output[i * 2]), vals, valid_items_store);
   }
 }
 
-
-template<class T>
-Status DequantizeBnb4(const T* quant_map, T *output, const unsigned char *quant_data, const T *absmax, int block_size, int numel, cudaStream_t stream)
-{
+template <class T>
+Status DequantizeBnb4(
+    const T* quant_map,
+    T* output,
+    const uint8_t* quant_data,
+    const T* absmax,
+    int block_size,
+    int numel,
+    cudaStream_t stream) {
   int tile_size = 1024;
-  kDequantizeBlockwise<T, 512, 64, 8><<<(numel+tile_size-1)/tile_size, 64, 0, stream>>>(quant_map, output, quant_data, absmax, block_size/2, numel);
-    
+  kDequantizeBlockwise<T, 512, 64, 8><<<(numel + tile_size - 1) / tile_size, 64, 0, stream>>>(
+      quant_map, output, quant_data, absmax, block_size / 2, numel);
+
   return Status::OK();
 }
 
-template Status DequantizeBnb4<float>(const float* quant_map, float *output, const unsigned char *quant_data, const float *absmax, int block_size, int numel, cudaStream_t stream);
-
-template Status DequantizeBnb4<half>(const half* quant_map, half *output, const unsigned char *quant_data, const half *absmax, int block_size, int numel, cudaStream_t stream);
+template Status DequantizeBnb4<float>(
+    const float* quant_map,
+    float* output,
+    const uint8_t* quant_data,
+    const float* absmax,
+    int block_size,
+    int numel,
+    cudaStream_t stream);
+
+template Status DequantizeBnb4<half>(
+    const half* quant_map,
+    half* output,
+    const uint8_t* quant_data,
+    const half *absmax,
+    int block_size,
+    int numel,
+    cudaStream_t stream);
 
 }  // namespace cuda
 }  // namespace contrib
 
@@ -15,7 +15,7 @@ template <class T>
 Status DequantizeBnb4(
     const T* quant_map,
     T* output,
-    const unsigned char* quant_data, 
+    const uint8_t* quant_data,
     const T* absmax,
     int block_size,
     int numel,