Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fastdeploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ def __init__(
# model for mtp/eagle/draft_model
self.model: Optional[str] = None
# quantization of model
self.quantization: Optional[str] = None
self.quantization: Optional[Dict[str, Any]] = None
# allocate more blocks to prevent mtp from finishing the block earlier than the main model
# Fixed now
self.num_gpu_block_expand_ratio: Optional[float] = 1
Expand Down
5 changes: 3 additions & 2 deletions fastdeploy/engine/args_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
DeprecatedOptionWarning,
FlexibleArgumentParser,
is_port_available,
parse_quantization,
)


Expand Down Expand Up @@ -137,7 +138,7 @@ class EngineArgs:
"""
dynamic load weight strategy
"""
quantization: str = None
quantization: Optional[Dict[str, Any]] = None
guided_decoding_backend: str = "off"
"""
Guided decoding backend.
Expand Down Expand Up @@ -538,7 +539,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
)
model_group.add_argument(
"--quantization",
type=str,
type=parse_quantization,
default=EngineArgs.quantization,
help="Quantization name for the model, currently support "
"'wint8', 'wint4',"
Expand Down
3 changes: 2 additions & 1 deletion fastdeploy/engine/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from __future__ import annotations

import json
import multiprocessing
import os
import re
Expand Down Expand Up @@ -484,7 +485,7 @@ def _start_worker_service(self):
f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
f" --quantization {self.cfg.model_config.quantization}"
f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
f" --ori_vocab_size {ori_vocab_size}"
f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
Expand Down
31 changes: 1 addition & 30 deletions fastdeploy/model_executor/layers/moe/ep.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,38 +28,9 @@

import fastdeploy
from fastdeploy.config import MoEPhase
from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
from fastdeploy.utils import singleton

# Best-effort import: noaux_tc is a custom GPU op that only exists in CUDA
# builds; log and continue so the module still imports in environments
# without the compiled extension.
try:
    from fastdeploy.model_executor.ops.gpu import noaux_tc
except Exception:  # was a bare `except:` — that also swallowed SystemExit/KeyboardInterrupt
    logger.warning("import noaux_tc Failed!")


def get_moe_scores(
    gating_output: paddle.Tensor,
    n_group,
    topk_group,
    top_k,
    routed_scaling_factor,
    e_score_correction_bias,
) -> tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
    """
    Compute MoE routing scores via the ``noaux_tc`` custom GPU op using
    ``e_score_correction_bias`` (bias-corrected, group-limited top-k routing).

    Args:
        gating_output: Raw gate logits for each token/expert pair.
        n_group: Number of expert groups; non-positive values are clamped to 1.
        topk_group: Number of groups kept; non-positive values are clamped to 1.
        top_k: Number of experts selected per token (forwarded to ``noaux_tc``).
        routed_scaling_factor: Scaling factor forwarded to ``noaux_tc``.
        e_score_correction_bias: Bias added to the sigmoid scores before
            selection; must not be None.

    Returns:
        ``(scores, topk_values, topk_idx)`` as produced by ``noaux_tc``.
        (The previous ``-> paddle.Tensor`` annotation under-reported the
        3-tuple actually returned.)
    """
    # Sigmoid-normalize the raw gate logits.
    scores = paddle.nn.functional.sigmoid(gating_output)
    # NOTE(review): assert is stripped under `python -O`; callers must not
    # rely on it for production validation.
    assert e_score_correction_bias is not None, "e_score_correction_bias is none!"
    # Selection operates on the biased scores; the unbiased scores are also
    # passed to the op (exact contract lives in the CUDA extension — see
    # fastdeploy.model_executor.ops.gpu).
    scores_with_bias = scores + e_score_correction_bias
    scores, topk_values, topk_idx = noaux_tc(
        scores,
        scores_with_bias,
        # Collapse non-positive group configs to a single group.
        n_group if n_group > 0 else 1,
        topk_group if topk_group > 0 else 1,
        top_k,
        routed_scaling_factor,
    )
    return scores, topk_values, topk_idx


@singleton
class DeepEPEngine:
Expand Down
32 changes: 2 additions & 30 deletions fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,7 @@
from .fused_moe_backend_base import UnquantizedFusedMoEMethod

if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import (
moe_expert_dispatch,
moe_expert_reduce,
noaux_tc,
)
from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch, moe_expert_reduce

try:
from fastdeploy.model_executor.ops.gpu import w4afp8_gemm_scale_permute
Expand All @@ -43,34 +39,10 @@
moe_expert_reduce,
)

from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
from fastdeploy.model_executor.utils import TensorTracker, free_tensor, set_weight_attrs


# used for deepseek_v3
def get_moe_scores(
    gating_output: paddle.Tensor,
    n_group,
    topk_group,
    top_k,
    routed_scaling_factor,
    e_score_correction_bias,
) -> tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
    """
    Compute MoE routing scores via the ``noaux_tc`` custom GPU op using
    ``e_score_correction_bias`` (DeepSeek-V3 style routing).

    Args:
        gating_output: Raw gate logits for each token/expert pair.
        n_group: Number of expert groups (forwarded to ``noaux_tc``).
        topk_group: Number of groups kept (forwarded to ``noaux_tc``).
        top_k: Number of experts selected per token.
        routed_scaling_factor: Scaling factor forwarded to ``noaux_tc``.
        e_score_correction_bias: Bias added to the sigmoid scores before
            selection.

    Returns:
        ``(scores, topk_values, topk_idx)`` as produced by ``noaux_tc``.
        (The previous ``-> paddle.Tensor`` annotation under-reported the
        3-tuple actually returned.)
    """
    # Sigmoid-normalize the raw gate logits.
    scores = paddle.nn.functional.sigmoid(gating_output)
    # NOTE(review): unlike the ep.py twin of this helper, this variant has no
    # None-check on the bias and no `> 0` clamping of the group counts —
    # confirm which contract is intended before consolidating the duplicates.
    scores_with_bias = scores + e_score_correction_bias
    scores, topk_values, topk_idx = noaux_tc(
        scores,
        scores_with_bias,
        n_group,
        topk_group,
        top_k,
        routed_scaling_factor,
    )
    return scores, topk_values, topk_idx


class CutlassMoEMethod(UnquantizedFusedMoEMethod):
"""
Use Cutlass Group Gemm to compute Fused MoE.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ def apply_tp(
gate_out = gate(x.cast("float32"))

if layer.topk_method == "noaux_tc":
from .ep import get_moe_scores
from fastdeploy.model_executor.layers.moe.moe import get_moe_scores

_, topk_weights, topk_ids = get_moe_scores(
gate_out,
Expand Down
26 changes: 1 addition & 25 deletions fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,39 +19,15 @@

import fastdeploy
from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
from fastdeploy.model_executor.ops.gpu import (
MoeWna16MarlinGemmApi,
noaux_tc,
tritonmoe_preprocess_func,
)

from ..quantization.quant_base import QuantMethodBase


def get_moe_scores(
    gating_output: paddle.Tensor,
    n_group,
    topk_group,
    top_k,
    routed_scaling_factor,
    e_score_correction_bias,
) -> tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
    """
    Compute MoE routing scores via the ``noaux_tc`` custom GPU op using
    ``e_score_correction_bias``.

    Args:
        gating_output: Raw gate logits for each token/expert pair.
        n_group: Number of expert groups (forwarded to ``noaux_tc``).
        topk_group: Number of groups kept (forwarded to ``noaux_tc``).
        top_k: Number of experts selected per token.
        routed_scaling_factor: Scaling factor forwarded to ``noaux_tc``.
        e_score_correction_bias: Bias added (with a leading broadcast dim)
            to the sigmoid scores before selection.

    Returns:
        ``(scores, topk_values, topk_idx)`` as produced by ``noaux_tc``.
        (The previous ``-> paddle.Tensor`` annotation under-reported the
        3-tuple actually returned.)
    """
    # Sigmoid-normalize the raw gate logits.
    scores = paddle.nn.functional.sigmoid(gating_output)
    # unsqueeze(0) adds a leading broadcast dimension to the bias (presumably
    # the token/batch axis — TODO confirm). NOTE(review): sibling backends add
    # the bias without unsqueeze; verify which broadcasting is intended before
    # consolidating the duplicates.
    scores_with_bias = scores + e_score_correction_bias.unsqueeze(0)
    scores, topk_values, topk_idx = noaux_tc(
        scores,
        scores_with_bias,
        n_group,
        topk_group,
        top_k,
        routed_scaling_factor,
    )
    return scores, topk_values, topk_idx


def gptq_marlin_moe_repack(
b_q_weight: paddle.Tensor,
perm: paddle.Tensor,
Expand Down
Loading
Loading