
Commit 9137985

fix errors
- fix implicit cast
- don't import MatMulBnb4Quantizer at top level
- fix sign type
- fix doc
1 parent 6836e69 commit 9137985

8 files changed, +28 −24 lines changed

docs/ContribOperators.md

Lines changed: 3 additions & 3 deletions
@@ -47,7 +47,7 @@ Do not modify directly.*
   * <a href="#com.microsoft.Inverse">com.microsoft.Inverse</a>
   * <a href="#com.microsoft.Irfft">com.microsoft.Irfft</a>
   * <a href="#com.microsoft.LongformerAttention">com.microsoft.LongformerAttention</a>
-  * <a href="#com.microsoft.MatMulNBits">com.microsoft.MatMulBnb4</a>
+  * <a href="#com.microsoft.MatMulBnb4">com.microsoft.MatMulBnb4</a>
   * <a href="#com.microsoft.MatMulFpQ4">com.microsoft.MatMulFpQ4</a>
   * <a href="#com.microsoft.MatMulInteger16">com.microsoft.MatMulInteger16</a>
   * <a href="#com.microsoft.MatMulIntegerToFloat">com.microsoft.MatMulIntegerToFloat</a>
@@ -2530,7 +2530,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dt><tt>block_size</tt> : int (required)</dt>
 <dd>number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.</dd>
 <dt><tt>quant_type</tt> : int (required)</dt>
-<dd>Quantization data type. 0 for FP4, 1 for NF4.</dd>
+<dd>quantization data type. 0 for FP4, 1 for NF4.</dd>
 </dl>
 
 #### Inputs
@@ -2541,7 +2541,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dt><tt>B</tt> : T2</dt>
 <dd>1-dimensional quantized data for weight</dd>
 <dt><tt>absmax</tt> : T1</dt>
-<dd>Quantization constants</dd>
+<dd>quantization constants</dd>
 </dl>
 
 #### Outputs

docs/OperatorKernels.md

Lines changed: 1 addition & 1 deletion
@@ -850,7 +850,7 @@ Do not modify directly.*
 |Inverse|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
 |Irfft|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
 |LongformerAttention|*in* input:**T**<br> *in* weight:**T**<br> *in* bias:**T**<br> *in* mask:**T**<br> *in* global_weight:**T**<br> *in* global_bias:**T**<br> *in* global:**G**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
-|MatMulBnb4|*in* A:**T1**<br> *in* B:**T2**<br> *in* absmax:**T1**<br> *out* Y:**T1**|1+|**T1** = = tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
+|MatMulBnb4|*in* A:**T1**<br> *in* B:**T2**<br> *in* absmax:**T1**<br> *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
 |MatMulNBits|*in* A:**T1**<br> *in* B:**T2**<br> *in* scales:**T1**<br> *in* zero_points:**T2**<br> *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
 |MultiHeadAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* bias:**T**<br> *in* key_padding_mask:**M**<br> *in* relative_position_bias:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**T** = tensor(float), tensor(float16)|
 |NGramRepeatBlock|*in* input_ids:**Tid**<br> *in* scores:**T**<br> *out* scores_out:**T**|1+|**T** = tensor(float)<br/> **Tid** = tensor(int64)|

onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block_bnb4.h

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ FORCEINLINE uint8_t QuantizeOneFP4(float x) {
   // that is difficult to noice if you add an extra
   // zero somewhere!
 
-  int sign = x < 0 ? 0b1000 : 0b0000;
+  uint8_t sign = x < 0 ? 0b1000 : 0b0000;
   x = fabsf(x);
   if (x > 0.29166667f) {
     if (x > 0.583333f) {
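Holding the sign in a uint8_t lets it combine with the 3-bit magnitude index without an implicit int-to-uint8_t narrowing when the final code is assembled. A rough Python sketch of the 4-bit layout this kernel produces for a value already normalized by its block's absmax; the nearest-value lookup and the codebook entries below are illustrative assumptions, not the kernel's exact comparison ladder:

```python
import numpy as np

# Illustrative 8-entry magnitude codebook for normalized inputs in [0, 1]; the real kernel
# encodes its fixed table through nested comparisons, so these values are assumptions.
FP4_MAGNITUDES = np.array([0.0, 0.0625, 1 / 6, 0.25, 1 / 3, 0.5, 2 / 3, 1.0])


def quantize_one_fp4(x: float) -> int:
    """Return a 4-bit code: high bit is the sign, low 3 bits index the magnitude."""
    sign = 0b1000 if x < 0 else 0b0000             # same role as the uint8_t sign above
    idx = int(np.abs(FP4_MAGNITUDES - abs(x)).argmin())
    return sign | idx                              # fits in a uint8_t without narrowing
```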

onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h

Lines changed: 2 additions & 2 deletions
@@ -30,7 +30,7 @@ void QuantizeBlockwiseBnb4(
       thread_pool,
       total_block_count,
       [&](ptrdiff_t block_idx) {
-        QuantizeBlockBnb4<T, block_size, DATA_TYPE>(src, dst, absmax[block_idx], block_idx, numel);
+        QuantizeBlockBnb4<T, block_size, DATA_TYPE>(src, dst, absmax[block_idx], static_cast<int32_t>(block_idx), numel);
       },
       0);
 }
@@ -87,7 +87,7 @@ void DequantizeBlockwiseBnb4(
       thread_pool,
       total_block_count,
       [&](ptrdiff_t block_idx) {
-        DequantizeBlockBnb4<T, block_size, DATA_TYPE>(src, dst, absmax[block_idx], block_idx, numel);
+        DequantizeBlockBnb4<T, block_size, DATA_TYPE>(src, dst, absmax[block_idx], static_cast<int32_t>(block_idx), numel);
       },
       0);
 }
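In both loops the thread pool hands block_idx over as a ptrdiff_t, so the per-block kernels now receive an explicit static_cast<int32_t> instead of an implicit narrowing conversion. For orientation, a small numpy sketch of the blockwise absmax scheme these kernels implement; the function name and codebook argument are illustrative, and the real code additionally packs two 4-bit codes per byte:

```python
import numpy as np


def dequantize_blockwise_bnb4(codes, absmax, codebook, block_size):
    """Sketch: map unpacked 4-bit codes to normalized values, then rescale each block."""
    values = codebook[np.asarray(codes)]                        # one normalized value per code
    values = values.reshape(-1, block_size)                     # one row per quantization block
    return (values * np.asarray(absmax)[:, None]).reshape(-1)   # per-block absmax rescaling
```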

onnxruntime/core/graph/contrib_ops/contrib_defs.cc

Lines changed: 7 additions & 7 deletions
@@ -3255,13 +3255,13 @@ Input absmax is stored in same type as original type of B(float32, float16) with
       .SetDomain(kMSDomain)
       .SinceVersion(1)
       .SetDoc(MatMulBnb4_ver1_doc)
-      .Attr("K", "Size of each input feature.", AttributeProto::INT)
-      .Attr("N", "Size of each output feature.", AttributeProto::INT)
-      .Attr("block_size", "Number of groupsize used for weight quantization.", AttributeProto::INT)
-      .Attr("quant_type", "Type of quantization used. 0 for FP4, 1 for NF4.", AttributeProto::INT)
-      .Input(0, "A", "The input tensor, not quantized.", "T1")
-      .Input(1, "B", "Quantized data for weight.", "T2")
-      .Input(2, "absmax", "Quantization constants for each block.", "T1")
+      .Attr("K", "size of each input feature", AttributeProto::INT)
+      .Attr("N", "size of each output feature", AttributeProto::INT)
+      .Attr("block_size", "number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.", AttributeProto::INT)
+      .Attr("quant_type", "quantization data type. 0 for FP4, 1 for NF4.", AttributeProto::INT)
+      .Input(0, "A", "The input tensor, not quantized", "T1")
+      .Input(1, "B", "1-dimensional quantized data for weight", "T2")
+      .Input(2, "absmax", "quantization constants", "T1")
       .Output(0, "Y", "tensor. The output tensor has the same rank as the input. ", "T1")
       .TypeConstraint("T1", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float/half_float tensors.")
       .TypeConstraint("T2", {"tensor(uint8)"}, "Constrain quantized weight types to uint8.")
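To illustrate the schema above, here is a minimal sketch that builds a MatMulBnb4 node with onnx.helper; the tensor names and sizes are hypothetical:

```python
from onnx import helper

# Hypothetical sizes: K input features, N output features, 64-element quantization blocks.
K, N, block_size, quant_type = 52, 32, 64, 1  # quant_type: 0 = FP4, 1 = NF4

node = helper.make_node(
    "MatMulBnb4",
    inputs=["A", "B_quantized", "absmax"],  # A: float/float16, B: uint8 codes, absmax: per-block constants
    outputs=["Y"],
    domain="com.microsoft",
    K=K,
    N=N,
    block_size=block_size,
    quant_type=quant_type,
)
```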

onnxruntime/python/tools/quantization/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
     MinMaxCalibrater,
     create_calibrator,
 )
-from .matmul_bnb4_quantizer import MatMulBnb4Quantizer  # noqa: F401
 from .matmul_weight4_quantizer import MatMulWeight4Quantizer  # noqa: F401
 from .qdq_quantizer import QDQQuantizer  # noqa: F401
 from .quant_utils import QuantFormat, QuantType, write_calibration_table  # noqa: F401
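With the top-level re-export gone, MatMulBnb4Quantizer is no longer importable from onnxruntime.quantization directly; a sketch of importing it from its own module instead, with the constructor call hedged as an assumption:

```python
import onnx

# With the top-level re-export removed, import the quantizer from its submodule.
from onnxruntime.quantization.matmul_bnb4_quantizer import MatMulBnb4Quantizer

# Hypothetical usage mirroring the test below; the (model, quant_type, block_size)
# argument order is an assumption about the quantizer's constructor.
model = onnx.load("matmul_fp32.onnx")
quantizer = MatMulBnb4Quantizer(model, 1, 64)  # 1 = NF4, blocks of 64 weights
quantizer.process()
```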

onnxruntime/test/contrib_ops/matmul_bnb4_test.cc

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ void RunTest(int64_t quant_type, int64_t M, int64_t N, int64_t K, int64_t block_
   QuantizeDequantizeBnb4(input1_f_vals,
                          input1_vals,
                          absmax,
-                         quant_type,
+                         static_cast<int32_t>(quant_type),
                          static_cast<int32_t>(N),
                          static_cast<int32_t>(K),
                          static_cast<int32_t>(block_size));

onnxruntime/test/python/quantization/test_op_matmul_bnb4.py

Lines changed: 13 additions & 8 deletions
@@ -13,7 +13,6 @@
 
 import numpy as np
 import onnx
-import parameterized
 from onnx import TensorProto, helper
 from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
 
@@ -141,7 +140,11 @@ def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name
 
         onnx.save(model, output_model_path)
 
-    def quant_test(self, model_fp32_path: str, data_reader: TestDataFeeds, quant_type: int, block_size: int):
+    def quant_test(self, quant_type: int, block_size: int):
+        model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath(f"matmul_fp32_{quant_type}.onnx").absolute())
+        self.construct_model_matmul(model_fp32_path, quant_type)
+        data_reader = self.input_feeds(1, {"input": [100, 52]})
+
         model_bnb4_path = str(
             Path(self._tmp_model_dir.name).joinpath(f"MatMulBnb4_{quant_type}_{block_size}.onnx").absolute()
         )
@@ -167,14 +170,16 @@ def quant_test(self, model_fp32_path: str, data_reader: TestDataFeeds, quant_typ
     @unittest.skipIf(
         find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_bnb4"
     )
-    @parameterized.parameterized.expand([0, 1])
-    def test_quantize_matmul_bnb4(self, quant_type):
+    def test_quantize_matmul_bnb4_fp4(self):
         np.random.seed(13)
+        self.quant_test(0, 64)
 
-        model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath(f"matmul_fp32_{quant_type}.onnx").absolute())
-        self.construct_model_matmul(model_fp32_path, quant_type)
-        data_reader = self.input_feeds(1, {"input": [100, 52]})
-        self.quant_test(model_fp32_path, data_reader, quant_type, 64)
+    @unittest.skipIf(
+        find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_bnb4"
+    )
+    def test_quantize_matmul_bnb4_nf4(self):
+        np.random.seed(13)
+        self.quant_test(1, 64)
 
 
 if __name__ == "__main__":
