@@ -161,7 +161,6 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
             and envs.FD_USE_MACHETE == "1"
             and layer.weight_shape[1]
             and layer.weight_shape[1] % 128 == 0
-            and not layer.add_bias
         ):
             return MacheteWeightOnlyLinearMethod(self)
         return GPUWeightOnlyLinearMethod(self)
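The change above drops the `and not layer.add_bias` guard, so biased linear layers no longer fall back to the generic GPU method. A minimal sketch of the resulting dispatch predicate, with a hypothetical helper name and a plain shape argument standing in for the layer object (not the FastDeploy API):

```python
import os

# Sketch of the dispatch predicate after this change: Machete is chosen when
# the env switch is on and the second weight dimension is a non-zero
# multiple of 128.
def _should_use_machete(weight_shape) -> bool:
    return (
        os.environ.get("FD_USE_MACHETE") == "1"
        and bool(weight_shape[1])
        and weight_shape[1] % 128 == 0
        # "and not layer.add_bias" is gone: bias no longer blocks dispatch,
        # since apply() now adds it after the matmul (see the last hunk).
    )
```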
@@ -244,7 +243,8 @@ def create_weights(self, layer, **extra_weight_attrs):
             )
         else:
             if isinstance(self, MacheteWeightOnlyLinearMethod):
-                weight_scale_shape = [1, layer.weight_shape[1]]
+                # Using group scale for machete, group size is 128
+                weight_scale_shape = [(layer.weight_shape[0] + 127) // 128, layer.weight_shape[1]]
             if self.quant_config.name() == "wint4":
                 layer.weight_shape[0] //= 8
         else:
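The scale buffer grows from a single per-channel row (`[1, n]`) to one row per 128-element group along dimension 0, rounded up via ceiling division. A quick arithmetic check with illustrative shapes:

```python
GROUP_SIZE = 128  # group size used by the Machete path in this diff

def machete_weight_scale_shape(weight_shape):
    # ceiling division: one scale row per (possibly partial) group of 128
    return [(weight_shape[0] + GROUP_SIZE - 1) // GROUP_SIZE, weight_shape[1]]

assert machete_weight_scale_shape([4096, 2048]) == [32, 2048]
assert machete_weight_scale_shape([4160, 2048]) == [33, 2048]  # partial group rounds up
```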
@@ -299,10 +299,12 @@ def process_weights_after_loading(self, layer) -> None:
                 machete_quantize_and_pack,
             )

+            # Using group scale for machete, group size is 128
             quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
                 w=layer.weight,
                 atype=layer._dtype,
                 quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+                group_size=128,
             )
         else:
             quanted_weight_tensor, weight_scale_tensor = weight_quantize(
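`machete_quantize_and_pack` now takes `group_size=128`, i.e. one scale per 128-row group per output channel rather than one per whole column. The real op also packs and pre-swizzles the weights for the kernel; as a rough numpy illustration of group-wise max-abs scaling only (signed int4 range assumed for simplicity, not the actual uint4b8 encoding):

```python
import numpy as np

def group_quantize(w: np.ndarray, group_size: int = 128):
    k, n = w.shape
    groups = (k + group_size - 1) // group_size
    # pad K up to a whole number of groups, then view as (groups, group_size, n)
    w_padded = np.zeros((groups * group_size, n), dtype=w.dtype)
    w_padded[:k] = w
    w_grouped = w_padded.reshape(groups, group_size, n)
    # one scale per (group, output channel), max-abs mapped onto the int4 range
    scale = np.abs(w_grouped).max(axis=1) / 7.0
    scale = np.where(scale == 0.0, 1.0, scale)  # avoid divide-by-zero on empty groups
    q = np.round(w_grouped / scale[:, None, :]).clip(-8, 7).astype(np.int8)
    return q.reshape(-1, n)[:k], scale  # scale shape: [groups, n]
```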
@@ -404,23 +406,27 @@ def process_loaded_weights(self, layer, weight) -> None:
             machete_quantize_and_pack,
         )

+        # Using group scale for machete, group size is 128
         quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
             w=weight,
             atype=layer._dtype,
             quant_type="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
         layer.weight.set_value(quanted_weight_tensor)
         layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

     def apply(self, layer, x):
-        assert layer.bias is None, "Machete weight only linear method does not support bias."
         from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm

+        # Using group scale for machete, group size is 128
         linear_out = machete_wint_mm(
             x,
             w_prepack=layer.weight,
             w_g_s=layer.weight_scale,
             weight_dtype="uint4b8" if self.quant_config.name() == "wint4" else "uint8b128",
+            group_size=128,
         )
-
+        if layer.with_bias:
+            linear_out = paddle.add(linear_out, layer.bias)
         return linear_out
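With the `layer.bias is None` assert removed, `apply()` runs the weight-only matmul and then adds the bias only when the layer has one, which is what made lifting the dispatch restriction in the first hunk safe. The control flow as a plain-array sketch (hypothetical stand-ins for the layer and the Machete op):

```python
import numpy as np

def apply_linear(x, dequantized_w, bias=None):
    out = x @ dequantized_w  # stands in for machete_wint_mm(...)
    if bias is not None:     # mirrors `if layer.with_bias:`
        out = out + bias     # mirrors paddle.add(linear_out, layer.bias)
    return out
```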