Commit 2e22a82

iosmers authored and root committed
[XPU]Support V1 loader in weight_only Model (PaddlePaddle#4808)
* support v1 loader in wint8
* code style
* update

---------

Co-authored-by: root <root@gajl-bbc-onlinec-com-1498356.gajl.baidu.com>
1 parent 0eae406 commit 2e22a82
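
What the change does, in brief: when load_choices is "default_v1" and the checkpoint is bf16, the XPU weight-only paths now create their parameters in checkpoint precision, let the loader copy shards into them, and only quantize to INT8/INT4 afterwards in process_weights_after_loading. The snippet below is an illustrative, framework-free sketch of that two-phase flow, not FastDeploy code: numpy stands in for paddle tensors, and a single per-tensor scale stands in for weight_quantize_xpu, which quantizes per channel.

import numpy as np

class Layer:
    """Toy stand-in for an XPU linear layer; attribute names mirror the diff."""

def create_weights(layer, in_features, out_features):
    # Phase 1 (default_v1 + bf16 checkpoint): allocate the weight in checkpoint
    # precision so the v1 loader can copy shards straight into it (float32 here).
    layer.weight = np.zeros((in_features, out_features), dtype=np.float32)

def process_weights_after_loading(layer):
    # Phase 2: once the loader has filled layer.weight, quantize it to int8,
    # drop the high-precision copy, and keep the scale needed for dequantization.
    scale = max(float(np.abs(layer.weight).max()) / 127.0, 1e-8)  # per-tensor here; per-channel on XPU
    quantized = np.clip(np.round(layer.weight / scale), -127, 127).astype(np.int8)
    layer.weight = quantized.T          # stored transposed, as in set_value(paddle.transpose(...))
    layer.weight_scale = np.float32(scale)

layer = Layer()
create_weights(layer, in_features=512, out_features=1024)
layer.weight[:] = np.random.randn(512, 1024)    # the v1 loader copying checkpoint shards
process_weights_after_loading(layer)
print(layer.weight.dtype, layer.weight.shape)   # int8 (1024, 512)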

File tree

2 files changed: +176 -21 lines changed

fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py

Lines changed: 105 additions & 5 deletions

@@ -29,7 +29,12 @@
     weight_quantize_xpu,
     xpu_moe_layer,
 )
-from fastdeploy.model_executor.utils import default_weight_loader, set_weight_attrs
+from fastdeploy.model_executor.utils import (
+    TensorTracker,
+    default_weight_loader,
+    free_tensor,
+    set_weight_attrs,
+)


 class XPUMoEMethod(MoEMethodBase):
@@ -62,15 +67,17 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         """
         create weight process.
         """
-        if layer.fd_config.load_config.load_choices == "default_v1" and self.moe_quant_type in ["w16a16"]:
+        if layer.fd_config.load_config.load_choices == "default_v1" and self.moe_quant_type in [
+            "w16a16",
+            "weight_only_int8",
+            "weight_only_int4",
+        ]:
             self.up_gate_proj_weight_shape = [
                 layer.num_local_experts,
                 layer.moe_intermediate_size * 2,
                 layer.hidden_size,
             ]
             self.down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
-            extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
-
             layer.up_gate_proj_weight = layer.create_parameter(
                 shape=self.up_gate_proj_weight_shape,
                 dtype=layer.weight_dtype,
@@ -86,18 +93,21 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
             set_weight_attrs(
                 layer.up_gate_proj_weight,
                 {
+                    "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0},
                     "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
                     "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
+                    "tensor_track": TensorTracker(shape=layer.up_gate_proj_weight.shape, output_dim=False),
                 },
             )
             set_weight_attrs(
                 layer.down_proj_weight,
                 {
+                    "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0},
                     "weight_loader": extra_weight_attrs.get("weight_loader", default_weight_loader(layer.fd_config)),
                     "weight_need_transpose": extra_weight_attrs.get("model_format") == "torch",
+                    "tensor_track": TensorTracker(shape=layer.down_proj_weight.shape, output_dim=True),
                 },
             )
-
             if layer.with_bias:
                 layer.up_gate_proj_bias = layer.create_parameter(
                     shape=[layer.num_experts, layer.moe_intermediate_size * 2],
@@ -128,6 +138,15 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
                         "model_format": extra_weight_attrs.get("model_format", ""),
                     },
                 )
+            if self.moe_quant_type in ["weight_only_int8", "weight_only_int4"]:
+                self.up_gate_proj_scale_shape = [
+                    layer.num_local_experts,
+                    layer.moe_intermediate_size * 2,
+                ]
+                self.down_proj_scale_shape = [
+                    layer.num_local_experts,
+                    layer.hidden_size,
+                ]

         else:
             self.up_gate_proj_weight_shape = [
@@ -531,6 +550,87 @@ def process_loaded_weights(self, layer: nn.Layer, state_dict):
         quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
         getattr(layer, scale_name).set_value(quanted_weight_scale)

+    def process_weights_after_loading(self, layer):
+        """ """
+        if not self.quant_config.is_checkpoint_bf16:
+            return
+        weight_id_map = {"gate_up": 0, "down": 1}
+        if (
+            hasattr(layer.up_gate_proj_weight, "tensor_track")
+            and layer.up_gate_proj_weight.tensor_track is not None
+            and layer.up_gate_proj_weight.tensor_track.is_fully_copied()
+        ):
+            weight_type = "gate_up"
+        else:
+            weight_type = "down"
+
+        # 1. init shape and type
+        # weight
+        weight_name = self.added_weight_attrs[weight_id_map[weight_type]]
+        unquantized_weight_name = weight_name.replace("quant_weight", "weight")
+        if weight_type == "gate_up":
+            weight_shape = [
+                layer.num_local_experts,
+                layer.moe_intermediate_size * 2,
+                layer.hidden_size,
+            ]
+        else:
+            weight_shape = [
+                layer.num_local_experts,
+                layer.hidden_size,
+                layer.moe_intermediate_size,
+            ]
+        weight_dtype = "int8"
+        # scale
+        scale_name = self.added_scale_attrs[weight_id_map[weight_type]]
+        scale_shape = self.up_gate_proj_scale_shape if weight_type == "gate_up" else self.down_proj_scale_shape
+        if self.moe_quant_type in ["weight_only_int4"]:
+            weight_shape[-1] //= 2
+        scale_dtype = "float32"
+
+        # 2. create tmp tensor
+
+        # weight = paddle.empty(weight_shape, dtype=weight_dtype)
+        # scale = paddle.empty(scale_shape, dtype=scale_dtype)
+
+        # 3. quantize weight
+        weight_list = []
+        weight_scale_list = []
+        for expert_id in range(layer.num_local_experts):
+            quant_weight, scale = weight_quantize_xpu(
+                getattr(layer, unquantized_weight_name)[expert_id].transpose([1, 0]), self.moe_quant_type, -1, -1
+            )
+            weight_list.append(quant_weight.transpose([1, 0]))
+            weight_scale_list.append(scale)
+        quanted_weight = paddle.stack(weight_list, axis=0)
+        quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
+
+        free_tensor(getattr(layer, unquantized_weight_name))
+
+        # create weight
+        setattr(
+            layer,
+            weight_name,
+            layer.create_parameter(
+                shape=weight_shape,
+                dtype=weight_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        # create scale
+        setattr(
+            layer,
+            scale_name,
+            layer.create_parameter(
+                shape=scale_shape,
+                dtype=scale_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+
+        getattr(layer, weight_name).set_value(quanted_weight)
+        getattr(layer, scale_name).set_value(quanted_weight_scale)


 class XPUW4A8MoEMethod(XPUMoEMethod):
     """

fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py

Lines changed: 71 additions & 16 deletions

@@ -17,11 +17,17 @@
 import paddle
 from paddle import nn

+from fastdeploy.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    MergedReplicatedLinear,
+    QKVParallelLinear,
+)
 from fastdeploy.model_executor.layers.quantization.weight_only import (
     WeightOnlyConfig,
     WeightOnlyLinearMethod,
 )
 from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu
+from fastdeploy.model_executor.utils import TensorTracker, free_tensor, set_weight_attrs


 class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod):
@@ -41,22 +47,48 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs) -> None:
         Create weights for linear layer on XPU
         """
         # The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
-        weight_scale_shape = [layer.weight_shape[1]]
-        layer.weight_shape.reverse()
-        if self.quant_config.name() == "weight_only_int4":
-            layer.weight_shape[0] //= 2
-        layer.weight_dtype = "int8"
-        layer.weight = layer.create_parameter(
-            shape=layer.weight_shape,
-            dtype=layer.weight_dtype,
-            is_bias=False,
-            default_initializer=paddle.nn.initializer.Constant(0),
-        )
-        layer.weight_scale = layer.create_parameter(
-            shape=weight_scale_shape,
-            dtype="float32",
-            is_bias=False,
-        )
+        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
+            layer.weight = layer.create_parameter(
+                shape=layer.weight_shape,
+                dtype=layer.weight_dtype,
+                is_bias=False,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            )
+            extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch"
+            quant_attrs = extra_weight_attrs
+            if (
+                isinstance(layer, MergedColumnParallelLinear)
+                or isinstance(layer, QKVParallelLinear)
+                or isinstance(layer, MergedReplicatedLinear)
+            ):
+                quant_attrs = {
+                    **extra_weight_attrs,
+                    "tensor_track": TensorTracker(
+                        shape=layer.weight_shape, output_dim=extra_weight_attrs.get("output_dim", True)
+                    ),
+                }
+            set_weight_attrs(
+                layer.weight,
+                quant_attrs,
+            )
+        else:
+            # The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
+            weight_scale_shape = [layer.weight_shape[1]]
+            layer.weight_shape.reverse()
+            if self.quant_config.name() == "weight_only_int4":
+                layer.weight_shape[0] //= 2
+            layer.weight_dtype = "int8"
+            layer.weight = layer.create_parameter(
+                shape=layer.weight_shape,
+                dtype=layer.weight_dtype,
+                is_bias=False,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            )
+            layer.weight_scale = layer.create_parameter(
+                shape=weight_scale_shape,
+                dtype="float32",
+                is_bias=False,
+            )

     def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
         """
@@ -76,3 +108,26 @@ def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
         weight_scale_tensor = paddle.concat(weight_scale_tensors, axis=0)
         layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
         layer.weight_scale.set_value(weight_scale_tensor)
+
+    def process_weights_after_loading(self, layer) -> None:
+        if not self.quant_config.is_checkpoint_bf16:
+            return
+
+        quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(layer.weight, self.quant_config.algo, -1, -1)
+
+        free_tensor(layer.weight)
+
+        layer.weight = layer.create_parameter(
+            shape=quanted_weight_tensor.shape[::-1],
+            dtype="int8",
+            is_bias=False,
+            default_initializer=paddle.nn.initializer.Constant(0),
+        )
+        layer.weight_scale = layer.create_parameter(
+            shape=weight_scale_tensor.shape,
+            dtype=weight_scale_tensor.dtype,
+            is_bias=False,
+            default_initializer=paddle.nn.initializer.Constant(0),
+        )
+        layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0]))
+        layer.weight_scale.copy_(weight_scale_tensor, False)
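
For reference, weight_quantize_xpu is used above as a black box that returns a quantized weight plus per-channel scales. Its exact XPU layout is not visible in this diff; the numpy snippet below only illustrates generic per-channel weight-only INT8 quantization (for weight_only_int4, two 4-bit values are additionally packed into each int8 byte, which is why the MoE path halves the last weight dimension).

import numpy as np

def per_channel_int8(weight: np.ndarray):
    """[in_features, out_features] float weight -> (int8 weight, one float32 scale per output channel)."""
    max_abs = np.abs(weight).max(axis=0)           # column-wise max, one value per output channel
    scale = np.maximum(max_abs / 127.0, 1e-8)      # guard against all-zero channels
    quantized = np.clip(np.round(weight / scale), -127, 127).astype(np.int8)
    return quantized, scale.astype(np.float32)

def dequantize(quantized: np.ndarray, scale: np.ndarray) -> np.ndarray:
    return quantized.astype(np.float32) * scale

w = np.random.randn(512, 1024).astype(np.float32)
q, s = per_channel_int8(w)
print(q.shape, s.shape, float(np.abs(dequantize(q, s) - w).max()))  # (512, 1024) (1024,) small error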
