
Commit 9137985

fix errors
- fix implicit cast
- don't import MatMulBnb4Quantizer at top level
- fix sign type
- fix doc
1 parent 6836e69 commit 9137985

8 files changed, +28 −24 lines changed

docs/ContribOperators.md

Lines changed: 3 additions & 3 deletions
@@ -47,7 +47,7 @@ Do not modify directly.*
   * <a href="#com.microsoft.Inverse">com.microsoft.Inverse</a>
   * <a href="#com.microsoft.Irfft">com.microsoft.Irfft</a>
   * <a href="#com.microsoft.LongformerAttention">com.microsoft.LongformerAttention</a>
-  * <a href="#com.microsoft.MatMulNBits">com.microsoft.MatMulBnb4</a>
+  * <a href="#com.microsoft.MatMulBnb4">com.microsoft.MatMulBnb4</a>
   * <a href="#com.microsoft.MatMulFpQ4">com.microsoft.MatMulFpQ4</a>
   * <a href="#com.microsoft.MatMulInteger16">com.microsoft.MatMulInteger16</a>
   * <a href="#com.microsoft.MatMulIntegerToFloat">com.microsoft.MatMulIntegerToFloat</a>
@@ -2530,7 +2530,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dt><tt>block_size</tt> : int (required)</dt>
 <dd>number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.</dd>
 <dt><tt>quant_type</tt> : int (required)</dt>
-<dd>Quantization data type. 0 for FP4, 1 for NF4.</dd>
+<dd>quantization data type. 0 for FP4, 1 for NF4.</dd>
 </dl>
 
 #### Inputs
@@ -2541,7 +2541,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dt><tt>B</tt> : T2</dt>
 <dd>1-dimensional quantized data for weight</dd>
 <dt><tt>absmax</tt> : T1</dt>
-<dd>Quantization constants</dd>
+<dd>quantization constants</dd>
 </dl>
 
 #### Outputs

docs/OperatorKernels.md

Lines changed: 1 addition & 1 deletion
@@ -850,7 +850,7 @@ Do not modify directly.*
 |Inverse|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
 |Irfft|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
 |LongformerAttention|*in* input:**T**<br> *in* weight:**T**<br> *in* bias:**T**<br> *in* mask:**T**<br> *in* global_weight:**T**<br> *in* global_bias:**T**<br> *in* global:**G**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
-|MatMulBnb4|*in* A:**T1**<br> *in* B:**T2**<br> *in* absmax:**T1**<br> *out* Y:**T1**|1+|**T1** = = tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
+|MatMulBnb4|*in* A:**T1**<br> *in* B:**T2**<br> *in* absmax:**T1**<br> *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
 |MatMulNBits|*in* A:**T1**<br> *in* B:**T2**<br> *in* scales:**T1**<br> *in* zero_points:**T2**<br> *out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
 |MultiHeadAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* bias:**T**<br> *in* key_padding_mask:**M**<br> *in* relative_position_bias:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**|1+|**T** = tensor(float), tensor(float16)|
 |NGramRepeatBlock|*in* input_ids:**Tid**<br> *in* scores:**T**<br> *out* scores_out:**T**|1+|**T** = tensor(float)<br/> **Tid** = tensor(int64)|

onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block_bnb4.h

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ FORCEINLINE uint8_t QuantizeOneFP4(float x) {
   // that is difficult to noice if you add an extra
   // zero somewhere!
 
-  int sign = x < 0 ? 0b1000 : 0b0000;
+  uint8_t sign = x < 0 ? 0b1000 : 0b0000;
   x = fabsf(x);
   if (x > 0.29166667f) {
     if (x > 0.583333f) {
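Holding the sign in a uint8_t lets it combine with the 3-bit magnitude index without an implicit int-to-uint8_t narrowing when the final code is assembled. A rough Python sketch of the 4-bit layout this kernel produces for a value already normalized by its block's absmax; the nearest-value lookup and the codebook entries below are illustrative assumptions, not the kernel's exact comparison ladder:

```python
import numpy as np

# Illustrative 8-entry magnitude codebook for normalized inputs in [0, 1]; the real kernel
# encodes its fixed table through nested comparisons, so these values are assumptions.
FP4_MAGNITUDES = np.array([0.0, 0.0625, 1 / 6, 0.25, 1 / 3, 0.5, 2 / 3, 1.0])


def quantize_one_fp4(x: float) -> int:
    """Return a 4-bit code: high bit is the sign, low 3 bits index the magnitude."""
    sign = 0b1000 if x < 0 else 0b0000             # same role as the uint8_t sign above
    idx = int(np.abs(FP4_MAGNITUDES - abs(x)).argmin())
    return sign | idx                              # fits in a uint8_t without narrowing
```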

onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h

Lines changed: 2 additions & 2 deletions
@@ -30,7 +30,7 @@ void QuantizeBlockwiseBnb4(
       thread_pool,
       total_block_count,
       [&](ptrdiff_t block_idx) {
-        QuantizeBlockBnb4<T, block_size, DATA_TYPE>(src, dst, absmax[block_idx], block_idx, numel);
+        QuantizeBlockBnb4<T, block_size, DATA_TYPE>(src, dst, absmax[block_idx], static_cast<int32_t>(block_idx), numel);
       },
       0);
 }
@@ -87,7 +87,7 @@ void DequantizeBlockwiseBnb4(
       thread_pool,
       total_block_count,
       [&](ptrdiff_t block_idx) {
-        DequantizeBlockBnb4<T, block_size, DATA_TYPE>(src, dst, absmax[block_idx], block_idx, numel);
+        DequantizeBlockBnb4<T, block_size, DATA_TYPE>(src, dst, absmax[block_idx], static_cast<int32_t>(block_idx), numel);
       },
       0);
 }
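In both loops the thread pool hands block_idx over as a ptrdiff_t, so the per-block kernels now receive an explicit static_cast<int32_t> instead of an implicit narrowing conversion. For orientation, a small numpy sketch of the blockwise absmax scheme these kernels implement; the function name and codebook argument are illustrative, and the real code additionally packs two 4-bit codes per byte:

```python
import numpy as np


def dequantize_blockwise_bnb4(codes, absmax, codebook, block_size):
    """Sketch: map unpacked 4-bit codes to normalized values, then rescale each block."""
    values = codebook[np.asarray(codes)]                        # one normalized value per code
    values = values.reshape(-1, block_size)                     # one row per quantization block
    return (values * np.asarray(absmax)[:, None]).reshape(-1)   # per-block absmax rescaling
```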

onnxruntime/core/graph/contrib_ops/contrib_defs.cc

Lines changed: 7 additions & 7 deletions
@@ -3255,13 +3255,13 @@ Input absmax is stored in same type as original type of B(float32, float16) with
       .SetDomain(kMSDomain)
       .SinceVersion(1)
       .SetDoc(MatMulBnb4_ver1_doc)
-      .Attr("K", "Size of each input feature.", AttributeProto::INT)
-      .Attr("N", "Size of each output feature.", AttributeProto::INT)
-      .Attr("block_size", "Number of groupsize used for weight quantization.", AttributeProto::INT)
-      .Attr("quant_type", "Type of quantization used. 0 for FP4, 1 for NF4.", AttributeProto::INT)
-      .Input(0, "A", "The input tensor, not quantized.", "T1")
-      .Input(1, "B", "Quantized data for weight.", "T2")
-      .Input(2, "absmax", "Quantization constants for each block.", "T1")
+      .Attr("K", "size of each input feature", AttributeProto::INT)
+      .Attr("N", "size of each output feature", AttributeProto::INT)
+      .Attr("block_size", "number of groupsize used for weight quantization,(default 128). It needs to be a power of 2 and not smaller than 16.", AttributeProto::INT)
+      .Attr("quant_type", "quantization data type. 0 for FP4, 1 for NF4.", AttributeProto::INT)
+      .Input(0, "A", "The input tensor, not quantized", "T1")
+      .Input(1, "B", "1-dimensional quantized data for weight", "T2")
+      .Input(2, "absmax", "quantization constants", "T1")
       .Output(0, "Y", "tensor. The output tensor has the same rank as the input. ", "T1")
       .TypeConstraint("T1", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float/half_float tensors.")
       .TypeConstraint("T2", {"tensor(uint8)"}, "Constrain quantized weight types to uint8.")
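To illustrate the schema above, here is a minimal sketch that builds a MatMulBnb4 node with onnx.helper; the tensor names and sizes are hypothetical:

```python
from onnx import helper

# Hypothetical sizes: K input features, N output features, 64-element quantization blocks.
K, N, block_size, quant_type = 52, 32, 64, 1  # quant_type: 0 = FP4, 1 = NF4

node = helper.make_node(
    "MatMulBnb4",
    inputs=["A", "B_quantized", "absmax"],  # A: float/float16, B: uint8 codes, absmax: per-block constants
    outputs=["Y"],
    domain="com.microsoft",
    K=K,
    N=N,
    block_size=block_size,
    quant_type=quant_type,
)
```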

onnxruntime/python/tools/quantization/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
     MinMaxCalibrater,
     create_calibrator,
 )
-from .matmul_bnb4_quantizer import MatMulBnb4Quantizer  # noqa: F401
 from .matmul_weight4_quantizer import MatMulWeight4Quantizer  # noqa: F401
 from .qdq_quantizer import QDQQuantizer  # noqa: F401
 from .quant_utils import QuantFormat, QuantType, write_calibration_table  # noqa: F401
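With the top-level re-export gone, MatMulBnb4Quantizer is no longer importable from onnxruntime.quantization directly; a sketch of importing it from its own module instead, with the constructor call hedged as an assumption:

```python
import onnx

# With the top-level re-export removed, import the quantizer from its submodule.
from onnxruntime.quantization.matmul_bnb4_quantizer import MatMulBnb4Quantizer

# Hypothetical usage mirroring the test below; the (model, quant_type, block_size)
# argument order is an assumption about the quantizer's constructor.
model = onnx.load("matmul_fp32.onnx")
quantizer = MatMulBnb4Quantizer(model, 1, 64)  # 1 = NF4, blocks of 64 weights
quantizer.process()
```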

onnxruntime/test/contrib_ops/matmul_bnb4_test.cc

Lines changed: 1 addition & 1 deletion
@@ -72,7 +72,7 @@ void RunTest(int64_t quant_type, int64_t M, int64_t N, int64_t K, int64_t block_
   QuantizeDequantizeBnb4(input1_f_vals,
                          input1_vals,
                          absmax,
-                         quant_type,
+                         static_cast<int32_t>(quant_type),
                          static_cast<int32_t>(N),
                          static_cast<int32_t>(K),
                          static_cast<int32_t>(block_size));

onnxruntime/test/python/quantization/test_op_matmul_bnb4.py

Lines changed: 13 additions & 8 deletions
@@ -13,7 +13,6 @@
 
 import numpy as np
 import onnx
-import parameterized
 from onnx import TensorProto, helper
 from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
 
@@ -141,7 +140,11 @@ def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name
 
         onnx.save(model, output_model_path)
 
-    def quant_test(self, model_fp32_path: str, data_reader: TestDataFeeds, quant_type: int, block_size: int):
+    def quant_test(self, quant_type: int, block_size: int):
+        model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath(f"matmul_fp32_{quant_type}.onnx").absolute())
+        self.construct_model_matmul(model_fp32_path, quant_type)
+        data_reader = self.input_feeds(1, {"input": [100, 52]})
+
         model_bnb4_path = str(
             Path(self._tmp_model_dir.name).joinpath(f"MatMulBnb4_{quant_type}_{block_size}.onnx").absolute()
         )
@@ -167,14 +170,16 @@ def quant_test(self, model_fp32_path: str, data_reader: TestDataFeeds, quant_typ
     @unittest.skipIf(
         find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_bnb4"
     )
-    @parameterized.parameterized.expand([0, 1])
-    def test_quantize_matmul_bnb4(self, quant_type):
+    def test_quantize_matmul_bnb4_fp4(self):
         np.random.seed(13)
+        self.quant_test(0, 64)
 
-        model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath(f"matmul_fp32_{quant_type}.onnx").absolute())
-        self.construct_model_matmul(model_fp32_path, quant_type)
-        data_reader = self.input_feeds(1, {"input": [100, 52]})
-        self.quant_test(model_fp32_path, data_reader, quant_type, 64)
+    @unittest.skipIf(
+        find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_bnb4"
+    )
+    def test_quantize_matmul_bnb4_nf4(self):
+        np.random.seed(13)
+        self.quant_test(1, 64)
 
 
 if __name__ == "__main__":
