System Info
- transformers version: 5.0.0.dev0
- Platform: Linux-5.15.0-122-generic-x86_64-with-glibc2.31
- Python version: 3.10.15
- Huggingface_hub version: 1.1.5
- Safetensors version: 0.4.5
- Accelerate version: 1.9.0
- Accelerate config: not found
- DeepSpeed version: 0.16.2
- PyTorch version (accelerator?): 2.9.0+cu128 (CUDA)
- Using distributed or parallel set-up in script?:
- Using GPU in script?:
- GPU type: NVIDIA A800-SXM4-80GB
Who can help?
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
This is my test code:
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel
model_name_or_path = "Qwen/Qwen3-30B-A3B"
qwen3moe_lora_files = "/path/to/lora_model"
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
trust_remote_code=True,
torch_dtype=torch.bfloat16,
).eval()
print(model)
model = PeftModel.from_pretrained(model, qwen3moe_lora_files)
print(model)

The structure printed for the LoRA-wrapped model is as follows:
PeftModelForCausalLM(
(base_model): LoraModel(
(model): Qwen3MoeForCausalLM(
(model): Qwen3MoeModel(
(embed_tokens): Embedding(151936, 2048)
(layers): ModuleList(
(0-47): 48 x Qwen3MoeDecoderLayer(
(self_attn): Qwen3MoeAttention(
(q_proj): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=4096, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=4096, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(k_proj): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=512, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=512, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(v_proj): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=512, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=512, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(o_proj): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=2048, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2048, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(q_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
(k_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
)
(mlp): Qwen3MoeSparseMoeBlock(
(experts): Qwen3MoeExperts(
(act_fn): SiLUActivation()
)
(router): Qwen3MoeTopKRouter()
)
(input_layernorm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
(post_attention_layernorm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
)
)
(norm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
(rotary_emb): Qwen3MoeRotaryEmbedding()
)
(lm_head): Linear(in_features=2048, out_features=151936, bias=False)
)
)
)
The LoRA adapters for the MoE block have not been loaded: the mlp block (router and experts) contains no lora.Linear modules, even though the attention projections are wrapped as expected (see the quick check below).
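As a quick check (a minimal sketch, assuming the model object from the reproduction code above and the installed peft version), the modules that actually received LoRA wrappers can be listed directly:

from peft.tuners.lora import LoraLayer

# Collect the names of every module that was wrapped with a LoRA layer.
wrapped = [name for name, module in model.named_modules() if isinstance(module, LoraLayer)]
print(len(wrapped))
# Entries under the MoE block: empty on 5.0.0.dev0, populated (gate/gate_proj/up_proj/down_proj) on 4.57.3.
print([name for name in wrapped if ".mlp." in name][:8])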
Expected behavior
This is the result of the same code on transformers 4.57.3:
PeftModelForCausalLM(
(base_model): LoraModel(
(model): Qwen3MoeForCausalLM(
(model): Qwen3MoeModel(
(embed_tokens): Embedding(151936, 2048)
(layers): ModuleList(
(0-47): 48 x Qwen3MoeDecoderLayer(
(self_attn): Qwen3MoeAttention(
(q_proj): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=4096, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=4096, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(k_proj): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=512, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=512, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(v_proj): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=512, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=512, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(o_proj): lora.Linear(
(base_layer): Linear(in_features=4096, out_features=2048, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2048, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(q_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
(k_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
)
(mlp): Qwen3MoeSparseMoeBlock(
(gate): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=128, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=128, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(experts): ModuleList(
(0-127): 128 x Qwen3MoeMLP(
(gate_proj): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=768, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=768, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(up_proj): lora.Linear(
(base_layer): Linear(in_features=2048, out_features=768, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=2048, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=768, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(down_proj): lora.Linear(
(base_layer): Linear(in_features=768, out_features=2048, bias=False)
(lora_dropout): ModuleDict(
(default): Identity()
)
(lora_A): ModuleDict(
(default): Linear(in_features=768, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2048, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(act_fn): SiLUActivation()
)
)
)
(input_layernorm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
(post_attention_layernorm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
)
)
(norm): Qwen3MoeRMSNorm((2048,), eps=1e-06)
(rotary_emb): Qwen3MoeRotaryEmbedding()
)
(lm_head): Linear(in_features=2048, out_features=151936, bias=False)
)
)
)
After upgrading to transformers 5.x, can previously trained LoRA adapters still be used? Or does this mean that old adapters need to be retrained against the transformers 5.x model structure?
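For reference, the structural difference behind the two printouts can be inspected on the loaded model. The sketch below (assuming the model object from the reproduction code; layer index 0 is arbitrary) prints the children and parameter shapes of the experts block, which on 4.57.3 is a ModuleList of per-expert Qwen3MoeMLP modules with gate_proj/up_proj/down_proj Linear layers, while on 5.0.0.dev0 it is a single Qwen3MoeExperts module without nn.Linear submodules, which presumably is why the LoRA target names no longer match:

# Minimal inspection sketch; `model` is the PeftModel loaded above.
mlp = model.base_model.model.model.layers[0].mlp
print(type(mlp.experts).__name__)
# Child modules of the experts block: per-expert Qwen3MoeMLP on 4.57.3, only the activation on 5.0.0.dev0.
print([name for name, _ in mlp.experts.named_children()][:5])
# Parameter names and shapes: per-expert 2-D Linear weights on 4.57.3,
# presumably packed across experts on 5.0.0.dev0 (no nn.Linear for LoRA to attach to).
for name, param in list(mlp.experts.named_parameters())[:5]:
    print(name, tuple(param.shape))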