From 700d3c1b9f94e1c60ac35b8e9f2edb7bef45e655 Mon Sep 17 00:00:00 2001 From: Sunny-bot1 <592045536@qq.com> Date: Thu, 4 Sep 2025 21:01:03 +0800 Subject: [PATCH 1/4] support machete wint8 and group scale --- custom_ops/gpu_ops/machete/machete_mm.cu | 6 +- .../gpu_ops/machete/machete_prepack_B.cu | 2 + .../layers/quantization/ops/machete_mm.py | 19 ++- tests/operators/test_machete_mm.py | 128 ++++++++++++++---- 4 files changed, 118 insertions(+), 37 deletions(-) diff --git a/custom_ops/gpu_ops/machete/machete_mm.cu b/custom_ops/gpu_ops/machete/machete_mm.cu index 53774fa0c91..c6f56d1c9f2 100644 --- a/custom_ops/gpu_ops/machete/machete_mm.cu +++ b/custom_ops/gpu_ops/machete/machete_mm.cu @@ -30,10 +30,12 @@ paddle::Tensor mm(paddle::Tensor const& A, paddle::Tensor const& B, std::optional const& maybe_token_scales, std::string maybe_schedule) { machete::ScalarType const b_type = machete::ScalarType::from_id(b_type_id); - std::optional maybe_group_size_opt; + std::optional maybe_group_size_opt = std::optional(maybe_group_size); std::optional maybe_schedule_opt; if (maybe_schedule == "") { maybe_schedule_opt = std::nullopt; + } else { + maybe_schedule_opt = std::optional(maybe_schedule); } return machete::mm_dispatch({.A = A, .B = B, @@ -63,6 +65,8 @@ std::vector MacheteMMKernel( paddle::DataType maybe_out_type; if (b_type_str == "uint4b8") { b_type_id = machete::kU4B8.id(); + } else if (b_type_str == "uint8b128") { + b_type_id = machete::kU8B128.id(); } else { PADDLE_ENFORCE(false, "b_type_str not supported!"); } diff --git a/custom_ops/gpu_ops/machete/machete_prepack_B.cu b/custom_ops/gpu_ops/machete/machete_prepack_B.cu index 6014ca9ef50..34bd1c70561 100644 --- a/custom_ops/gpu_ops/machete/machete_prepack_B.cu +++ b/custom_ops/gpu_ops/machete/machete_prepack_B.cu @@ -51,6 +51,8 @@ std::vector MachetePrepackBKernel( if (b_type_str == "uint4b8") { b_type_id = machete::kU4B8.id(); + } else if (b_type_str == "uint8b128") { + b_type_id = machete::kU8B128.id(); } else { PADDLE_ENFORCE(false, "b_type_str not supported!"); } diff --git a/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py b/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py index 218da0d2182..b080bb627c5 100644 --- a/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py +++ b/fastdeploy/model_executor/layers/quantization/ops/machete_mm.py @@ -85,7 +85,7 @@ def quantize_weights( w_s: Scales (None if `group_size` is None). """ assert paddle.is_floating_point(w), "w must be float type" - assert quant_type in ["uint4", "uint4b8"], "only support quant_type = uint4, uint4b8" + assert quant_type in ["uint4b8", "uint8b128"], "only support quant_type = uint4b8, uint8b128" orig_device = w.place size_k, size_n = w.shape @@ -103,8 +103,12 @@ def quantize_weights( max_val = paddle.max(w, axis=0, keepdim=True) min_val = paddle.min(w, axis=0, keepdim=True) - max_q_val = float(7.0) - min_q_val = float(-8.0) + if quant_type == "uint4b8": + max_q_val = float(7.0) + min_q_val = float(-8.0) + else: + max_q_val = float(127.0) + min_q_val = float(-128.0) w_s = paddle.ones([1], dtype=paddle.float32) # unscaled case @@ -124,6 +128,8 @@ def quantize_weights( # w_q += quant_type.bias if quant_type == "uint4b8": w_q += 8 + else: + w_q += 128 # Restore original shapes if group_size is not None and group_size < size_k: @@ -131,11 +137,11 @@ def quantize_weights( def reshape_w(w_tensor): w_tensor = w_tensor.reshape([group_size, -1, size_n]) w_tensor = w_tensor.transpose([1, 0, 2]) - w_tensor = w_tensor.reshape([size_k, size_n]) + w_tensor = w_tensor.reshape([size_k, size_n]).contiguous() return w_tensor w_q = reshape_w(w_q) - w_s = w_s.reshape([-1, size_n]) + w_s = w_s.reshape([-1, size_n]).contiguous() # Move tensors back to original device w_q = w_q.to(orig_device) @@ -153,7 +159,8 @@ def machete_quantize_and_pack( group_size: int = -1, ): w_q, w_s = quantize_weights(w, group_size, quant_type=quant_type) - w_q = pack_rows(w_q, 4, *w_q.shape) + num_bits = 4 if quant_type == "uint4b8" else 8 + w_q = pack_rows(w_q, num_bits, *w_q.shape) w_q_col = w_q.transpose([1, 0]).contiguous() # convert to col major w_q_prepack = machete_prepack_B( w_q_col, diff --git a/tests/operators/test_machete_mm.py b/tests/operators/test_machete_mm.py index 117fd792826..fafdf717d89 100644 --- a/tests/operators/test_machete_mm.py +++ b/tests/operators/test_machete_mm.py @@ -64,11 +64,11 @@ def convert_uint16_to_float(in_list): not core.is_compiled_with_cuda() or get_sm_version() < 90, "machete only support sm90.", ) -class WeightOnlyLinearTestCase(unittest.TestCase): +class WeightOnlyInt4LinearTestCase(unittest.TestCase): def config(self): self.dtype = "float16" self.rtol = 1e-5 - self.atol = 1e-2 + self.atol = 1.3e-1 self.bias = False self.batch = 1 self.token = 512 @@ -77,11 +77,10 @@ def config(self): self.weight_dtype = "int4" self.static = False self.group_size = -1 + self.machete_group_size = -1 def setUp(self): self.config() - if self.dtype == "bfloat16" or self.weight_dtype == "int4": - self.atol = 1.3e-1 x = np.random.random((self.token, self.in_features)) self.x = paddle.to_tensor(x, dtype=self.dtype) if self.bias: @@ -111,29 +110,30 @@ def get_linear_out(self): return out.numpy() def get_weight_only_linear_out(self): - for i in range(10): - out = Q.weight_only_linear( - self.x, - self.weight, - bias=self.bias, - weight_scale=self.weight_scale, - weight_dtype=self.weight_dtype, - group_size=self.group_size, - ) + out = Q.weight_only_linear( + self.x, + self.weight, + bias=self.bias, + weight_scale=self.weight_scale, + weight_dtype=self.weight_dtype, + group_size=self.group_size, + ) return out.numpy() def get_machete_weight_only_linear_out(self): w_q, w_s = machete_quantize_and_pack( w=self.float_weight.cuda(), atype=self.dtype, - quant_type="uint4b8", + quant_type="uint4b8" if self.weight_dtype == "int4" else "uint8b128", + group_size=self.machete_group_size, ) out = machete_wint_mm( self.x, w_prepack=w_q, w_g_s=w_s, # group scales - weight_dtype="uint4b8", # weight_dtype + weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128", # weight_dtype + group_size=self.machete_group_size, ) return out.numpy() @@ -149,26 +149,94 @@ def test_weight_only_linear(self): np.testing.assert_allclose(out_paddle, out_machete, rtol=self.rtol, atol=self.atol) -M = [32, 128] -K_N = [[2048, 4096]] +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_sm_version() < 90, + "machete only support sm90.", +) +class WeightOnlyInt8LinearTestCase(unittest.TestCase): + def config(self): + self.dtype = "float16" + self.rtol = 1e-5 + self.atol = 1e-1 + self.bias = False + self.batch = 1 + self.token = 512 + self.in_features = 7168 + self.out_features = 1024 + self.weight_dtype = "int8" + self.static = False + self.group_size = -1 + self.machete_group_size = 128 + def setUp(self): + self.config() + x = np.random.random((self.token, self.in_features)) + self.x = paddle.to_tensor(x, dtype=self.dtype) + if self.bias: + bias_attr = base.ParamAttr( + trainable=False, + regularizer=None, + initializer=paddle.nn.initializer.Constant(value=1.0), + ) + else: + bias_attr = None + set_default_dtype(self.dtype) + self.linear = paddle.nn.Linear(self.in_features, self.out_features, bias_attr=bias_attr) -def make_case(m, k, n): - class Case(WeightOnlyLinearTestCase): - def config(self, _m=m, _k=k, _n=n): - super().config() - self.token = m - self.in_features = k - self.out_features = n + self.bias = self.linear.bias + self.weight = self.linear.weight + self.float_weight = self.linear.weight + self.weight_scale = None + + self.weight, self.weight_scale = Q.weight_quantize( + (self.float_weight.cuda() if self.weight_dtype == "int8" else self.weight.cpu()), + algo=("weight_only_int8" if self.weight_dtype == "int8" else "weight_only_int4"), + group_size=self.group_size, + ) + + def get_linear_out(self): + out = self.linear(self.x) + return out.numpy() + + def get_weight_only_linear_out(self): + out = Q.weight_only_linear( + self.x, + self.weight, + bias=self.bias, + weight_scale=self.weight_scale, + weight_dtype=self.weight_dtype, + group_size=self.group_size, + ) + return out.numpy() - Case.name = f"WeightOnlyLinearTestCase{m}{k}{n}" - return Case + def get_machete_weight_only_linear_out(self): + w_q, w_s = machete_quantize_and_pack( + w=self.float_weight.cuda(), + atype=self.dtype, + quant_type="uint4b8" if self.weight_dtype == "int4" else "uint8b128", + group_size=self.machete_group_size, + ) + out = machete_wint_mm( + self.x, + w_prepack=w_q, + w_g_s=w_s, # group scales + weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128", # weight_dtype + group_size=self.machete_group_size, + ) + return out.numpy() + + def test_weight_only_linear(self): + out_expect = self.get_linear_out() + # out_paddle = self.get_weight_only_linear_out() + out_machete = self.get_machete_weight_only_linear_out() + + if self.dtype == "bfloat16": + # out_paddle = convert_uint16_to_float(out_paddle) + out_expect = convert_uint16_to_float(out_expect) + out_machete = convert_uint16_to_float(out_machete) + np.testing.assert_allclose(out_expect, out_machete, rtol=self.rtol, atol=self.atol) -for k, n in K_N: - for m in M: - cls = make_case(m, k, n) - globals()[cls.name] = cls if __name__ == "__main__": unittest.main() From 00fb14ec319052d13ae47191db9453a30cf68623 Mon Sep 17 00:00:00 2001 From: Sunny-bot1 <592045536@qq.com> Date: Tue, 9 Sep 2025 16:13:47 +0800 Subject: [PATCH 2/4] support machete wint8 model --- .../layers/quantization/weight_only.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py index ac77f15f3a8..d4134712dcf 100644 --- a/fastdeploy/model_executor/layers/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/quantization/weight_only.py @@ -142,8 +142,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: ) if ( - self.name() == "wint4" - and _ENABLE_MACHETE + _ENABLE_MACHETE and envs.FD_USE_MACHETE == "1" and layer.weight_shape[1] and layer.weight_shape[1] % 128 == 0 @@ -371,14 +370,11 @@ def __init__( def create_weights(self, layer, **extra_weight_attrs): assert layer.bias is None, "Machete weight only linear method does not support bias." - assert self.quant_config.name() == "wint4", "Machete weight only linear method only supports wint4." - - # The scale shape should be equal to the output dim of weight using Per-Channel Quantization. weight_scale_shape = [1, layer.weight_shape[1]] - - # layer.weight_shape.reverse() if self.quant_config.name() == "wint4": layer.weight_shape[0] //= 8 + else: + layer.weight_shape[0] //= 4 layer.weight_dtype = "int32" layer.weight = layer.create_parameter( @@ -405,21 +401,20 @@ def process_loaded_weights(self, layer, weight) -> None: quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack( w=weight, atype=layer._dtype, - quant_type="uint4b8", + quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128", ) layer.weight.set_value(quanted_weight_tensor) layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype())) def apply(self, layer, x): assert layer.bias is None, "Machete weight only linear method does not support bias." - assert self.quant_config.name() == "wint4", "Machete weight only linear method only supports wint4." from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm linear_out = machete_wint_mm( x, w_prepack=layer.weight, w_g_s=layer.weight_scale, - weight_dtype="uint4b8", + weight_dtype="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128", ) return linear_out From 7048df6d5c3945aeefa6b5cd10c299d7ae73a879 Mon Sep 17 00:00:00 2001 From: Sunny-bot1 <592045536@qq.com> Date: Fri, 12 Sep 2025 11:42:06 +0800 Subject: [PATCH 3/4] fix bias --- fastdeploy/model_executor/layers/quantization/weight_only.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py index a5d3a4c4f6e..97b9dc7c7bf 100644 --- a/fastdeploy/model_executor/layers/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/quantization/weight_only.py @@ -146,6 +146,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: and envs.FD_USE_MACHETE == "1" and layer.weight_shape[1] and layer.weight_shape[1] % 128 == 0 + and layer.bias is None ): return MacheteWeightOnlyLinearMethod(self) return GPUWeightOnlyLinearMethod(self) From f6b9b4da3befd68f283fe6835234747374eb0e79 Mon Sep 17 00:00:00 2001 From: Sunny-bot1 <592045536@qq.com> Date: Fri, 12 Sep 2025 17:36:49 +0800 Subject: [PATCH 4/4] fix --- fastdeploy/model_executor/layers/quantization/weight_only.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py index 609ad3f458a..0d56491a134 100644 --- a/fastdeploy/model_executor/layers/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/quantization/weight_only.py @@ -146,7 +146,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: and envs.FD_USE_MACHETE == "1" and layer.weight_shape[1] and layer.weight_shape[1] % 128 == 0 - and layer.bias is None + and not layer.add_bias ): return MacheteWeightOnlyLinearMethod(self) return GPUWeightOnlyLinearMethod(self)