
Commit c3b8ebe

[Optimize] Machete using group scale default (#4121)
1 parent 62b8b02 commit c3b8ebe

2 files changed (+15 lines, -5 lines)


fastdeploy/model_executor/layers/quantization/weight_only.py

Lines changed: 10 additions & 4 deletions
@@ -161,7 +161,6 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
             and envs.FD_USE_MACHETE == "1"
             and layer.weight_shape[1]
             and layer.weight_shape[1] % 128 == 0
-            and not layer.add_bias
         ):
             return MacheteWeightOnlyLinearMethod(self)
         return GPUWeightOnlyLinearMethod(self)
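
With the add-bias restriction removed, the remaining shape requirement for choosing the Machete path is that weight_shape[1] be non-zero and a multiple of 128. A quick illustration with made-up values; the helper below is hypothetical and only mirrors the condition shown in this hunk (the real get_quant_method has additional checks not visible here):

    def machete_eligible(dim1: int, use_machete: bool = True) -> bool:
        # mirrors the simplified dispatch condition above
        return use_machete and bool(dim1) and dim1 % 128 == 0

    print(machete_eligible(7168))   # True:  7168 = 56 * 128
    print(machete_eligible(7000))   # False: 7000 is not a multiple of 128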
@@ -244,7 +243,8 @@ def create_weights(self, layer, **extra_weight_attrs):
             )
         else:
             if isinstance(self, MacheteWeightOnlyLinearMethod):
-                weight_scale_shape = [1, layer.weight_shape[1]]
+                # Using group scale for machete, group size is 128
+                weight_scale_shape = [(layer.weight_shape[0] + 127) // 128, layer.weight_shape[1]]
             if self.quant_config.name() == "wint4":
                 layer.weight_shape[0] //= 8
             else:
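
For a concrete feel for the new scale layout, here is a minimal sketch of the shape arithmetic above in plain Python, assuming weight_shape is [in_features, out_features]; the [7168, 1024] shape is a made-up example, not taken from the diff:

    in_features, out_features = 7168, 1024   # hypothetical weight_shape
    group_size = 128

    old_scale_shape = [1, out_features]                         # before: one scale per output column
    num_groups = (in_features + group_size - 1) // group_size   # ceil(7168 / 128) = 56
    new_scale_shape = [num_groups, out_features]                # after: one scale per 128-row group

    print(old_scale_shape, new_scale_shape)   # [1, 1024] [56, 1024]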
@@ -299,10 +299,12 @@ def process_weights_after_loading(self, layer) -> None:
                 machete_quantize_and_pack,
             )

+            # Using group scale for machete, group size is 128
             quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
                 w=layer.weight,
                 atype=layer._dtype,
                 quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+                group_size=128,
             )
         else:
             quanted_weight_tensor, weight_scale_tensor = weight_quantize(
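
machete_quantize_and_pack is a FastDeploy custom op whose packing details are not part of this diff; the NumPy sketch below only illustrates what a [ceil(K/128), N] group-scale tensor means, assuming an absmax convention (the division by 7 reflects a signed 4-bit range and is an assumption, not taken from the kernel):

    import numpy as np

    def groupwise_absmax_scales(w, group_size=128):
        # w: [in_features, out_features] -> scales: [ceil(in_features / group_size), out_features]
        k, n = w.shape
        num_groups = (k + group_size - 1) // group_size
        scales = np.empty((num_groups, n), dtype=w.dtype)
        for g in range(num_groups):
            block = w[g * group_size:(g + 1) * group_size]
            scales[g] = np.abs(block).max(axis=0) / 7.0   # assumed wint4-style absmax scaling
        return scales

    w = np.random.randn(512, 64).astype(np.float32)
    print(groupwise_absmax_scales(w).shape)   # (512 / 128, 64) -> (4, 64)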
@@ -404,23 +406,27 @@ def process_loaded_weights(self, layer, weight) -> None:
                 machete_quantize_and_pack,
             )

+            # Using group scale for machete, group size is 128
             quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
                 w=weight,
                 atype=layer._dtype,
                 quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+                group_size=128,
             )
         layer.weight.set_value(quanted_weight_tensor)
         layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

     def apply(self, layer, x):
-        assert layer.bias is None, "Machete weight only linear method does not support bias."
         from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm

+        # Using group scale for machete, group size is 128
         linear_out = machete_wint_mm(
             x,
             w_prepack=layer.weight,
             w_g_s=layer.weight_scale,
             weight_dtype="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
-
+        if layer.with_bias:
+            linear_out = paddle.add(linear_out, layer.bias)
         return linear_out
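
machete_wint_mm is the prepacked weight-only GEMM kernel, so it cannot be reproduced here; the sketch below stands in a plain paddle.matmul only to show the control flow this commit adds around it (dequantized_weight is a hypothetical attribute used so the sketch reads end to end):

    import paddle

    def apply_sketch(layer, x):
        # stand-in for machete_wint_mm(x, w_prepack=layer.weight, w_g_s=layer.weight_scale,
        #                              weight_dtype=..., group_size=128)
        linear_out = paddle.matmul(x, layer.dequantized_weight)   # hypothetical attribute
        if layer.with_bias:
            # bias is now applied after the kernel instead of being rejected by an assert
            linear_out = paddle.add(linear_out, layer.bias)
        return linear_out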

tests/operators/test_machete_mm.py

Lines changed: 5 additions & 1 deletion
@@ -135,6 +135,8 @@ def get_machete_weight_only_linear_out(self):
             weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128",  # weight_dtype
             group_size=self.machete_group_size,
         )
+        if self.bias is not None:
+            out = paddle.add(out, self.bias)
         return out.numpy()

     def test_weight_only_linear(self):
@@ -158,7 +160,7 @@ def config(self):
         self.dtype = "float16"
         self.rtol = 1e-5
         self.atol = 1e-1
-        self.bias = False
+        self.bias = True
         self.batch = 1
         self.token = 512
         self.in_features = 7168
@@ -224,6 +226,8 @@ def get_machete_weight_only_linear_out(self):
             weight_dtype="uint4b8" if self.weight_dtype == "int4" else "uint8b128",  # weight_dtype
             group_size=self.machete_group_size,
         )
+        if self.bias is not None:
+            out = paddle.add(out, self.bias)
         return out.numpy()

     def test_weight_only_linear(self):
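
Both test classes now add the bias to the machete output before comparing it with the reference path; in sketch form the final check is roughly the following (the helper is hypothetical; the tolerances rtol=1e-5 and atol=1e-1 come from the config hunk above):

    import numpy as np

    def check_machete_against_reference(machete_out: np.ndarray, reference_out: np.ndarray) -> None:
        # tolerances taken from the test config above
        np.testing.assert_allclose(machete_out, reference_out, rtol=1e-5, atol=1e-1)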
