Skip to content

Commit f3eae82

Browse files
rsmallbluechang-wenbin
authored and committed
fix Cfp8 for RL load (PaddlePaddle#4144)
1 parent 5c53001 commit f3eae82

File tree

1 file changed

+9
-0
lines changed

1 file changed

+9
-0
lines changed

fastdeploy/model_executor/layers/attention/attention.py

Lines changed: 9 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -24,6 +24,9 @@
2424
from paddleformers.utils.log import logger
2525

2626
from fastdeploy.config import FDConfig
27+
from fastdeploy.model_executor.layers.quantization.kv_cache import (
28+
KvCacheQuantzationTypes,
29+
)
2730
from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
2831

2932
if TYPE_CHECKING:
@@ -107,6 +110,12 @@ def __init__(
107110

108111
if fd_config.quant_config and hasattr(fd_config.quant_config, "kv_cache_quant_type"):
109112
self.quant_method: QuantMethodBase = fd_config.quant_config.get_quant_method(self)
113+
114+
# set for RL model, as RL do not need load state dict
115+
if fd_config.quant_config.kv_cache_quant_type == KvCacheQuantzationTypes.BLOCK_WISE_FP8:
116+
self.cache_quant_type_str = "block_wise_fp8"
117+
self.quant_max_bound = 448.0
118+
self.quant_min_bound = -448.0
110119
else:
111120
self.quant_method = None
112121

0 commit comments

Comments (0)