27 changes: 27 additions & 0 deletions configs/qwen2_5/fp8_dynamic/qwen2_5-0_5b_instruct_fp8_dynamic.yaml
@@ -0,0 +1,27 @@
# Global configuration of pipeline
global:
  save_path: ./output

# Simplified Configuration for LLM compression
model:
  name: Qwen
  model_path: Qwen/Qwen2.5-0.5B-Instruct
  trust_remote_code: true
  low_cpu_mem_usage: true
  use_cache: false
  torch_dtype: auto
  device_map: auto

# Compression configuration
compression:
  name: PTQ
  quantization:
    name: fp8_dynamic  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
    bits: 8  # Quantization bits (4/8)
    quant_method:
      weight: "per-tensor"
      activation: "per-tensor"
    ignore_layers:  # Skip quantization for these layers
      - "lm_head"
      - "model.embed_tokens"

26 changes: 26 additions & 0 deletions configs/qwen2_5/fp8_dynamic/qwen2_5-3b_instruct_fp8_dynamic.yaml
@@ -0,0 +1,26 @@
# Global configuration of pipeline
global:
  save_path: ./output

# Simplified Configuration for LLM compression
model:
  name: Qwen
  model_path: Qwen/Qwen2.5-3B-Instruct
  trust_remote_code: true
  low_cpu_mem_usage: true
  use_cache: false
  torch_dtype: auto
  device_map: auto

# Compression configuration
compression:
  name: PTQ
  quantization:
    name: fp8_dynamic  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
    bits: 8  # Quantization bits (4/8)
    quant_method:
      weight: "per-tensor"
      activation: "per-tensor"
    ignore_layers:  # Skip quantization for these layers
      - "lm_head"
      - "model.embed_tokens"
26 changes: 26 additions & 0 deletions configs/qwen2_5/fp8_dynamic/qwen2_5-72b_instruct_fp8_dynamic.yaml
@@ -0,0 +1,26 @@
# Global configuration of pipeline
global:
  save_path: ./output

# Simplified Configuration for LLM compression
model:
  name: Qwen
  model_path: Qwen/Qwen2.5-72B-Instruct
  trust_remote_code: true
  low_cpu_mem_usage: true
  use_cache: false
  torch_dtype: auto
  device_map: auto

# Compression configuration
compression:
  name: PTQ
  quantization:
    name: fp8_dynamic  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
    bits: 8  # Quantization bits (4/8)
    quant_method:
      weight: "per-tensor"
      activation: "per-tensor"
    ignore_layers:  # Skip quantization for these layers
      - "lm_head"
      - "model.embed_tokens"
34 changes: 34 additions & 0 deletions configs/qwen2_5/fp8_static/qwen2_5-0_5b_instruct_fp8_static.yaml
@@ -0,0 +1,34 @@
# Global configuration of pipeline
global:
  save_path: ./output

# Simplified Configuration for LLM compression
model:
  name: Qwen
  model_path: Qwen/Qwen2.5-0.5B-Instruct
  trust_remote_code: true
  low_cpu_mem_usage: true
  use_cache: false
  torch_dtype: auto
  device_map: auto

# Compression configuration
compression:
  name: PTQ
  quantization:
    name: fp8_static  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
    bits: 8  # Quantization bits (4/8)
    quant_method:
      weight: "per-tensor"
      activation: "per-tensor"
    ignore_layers:  # Skip quantization for these layers
      - "lm_head"
      - "model.embed_tokens"

# Dataset for calibration
dataset:
  name: TextDataset
  data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
  max_seq_length: 4096
  num_samples: 256
  batch_size: 1
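
Note: unlike fp8_dynamic, fp8_static freezes activation scales offline, which is why a calibration dataset block appears in these configs: num_samples: 256 sequences of up to max_seq_length: 4096 tokens are run through the model to observe activation ranges. A minimal sketch of the usual max-abs calibration idea follows (hypothetical names, not this repository's API):

# Hypothetical illustration of static scale calibration.
import torch

FP8_E4M3_MAX = 448.0  # largest finite float8_e4m3fn value

class MaxAbsObserver:
    """Tracks the running max |activation| seen during calibration."""
    def __init__(self):
        self.amax = 0.0

    def update(self, x: torch.Tensor):
        self.amax = max(self.amax, x.detach().abs().max().item())

    def scale(self) -> float:
        # Frozen after calibration and reused for every inference batch.
        return max(self.amax, 1e-12) / FP8_E4M3_MAX

# Typical use: hook one observer onto each quantized activation, run the
# 256 calibration samples, then persist observer.scale() with the weights.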
@@ -5,7 +5,7 @@ global:
 # Simplified Configuration for LLM compression
 model:
   name: Qwen
-  model_path: Qwen/Qwen2.5-7B-Instruct
+  model_path: Qwen/Qwen2.5-3B-Instruct
   trust_remote_code: true
   low_cpu_mem_usage: true
   use_cache: false
@@ -28,7 +28,7 @@ compression:
 # Dataset for calibration
 dataset:
   name: TextDataset
-  data_path: ./dataset/sharegpt/sharegpt_gpt4_512.json
+  data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
   max_seq_length: 4096
   num_samples: 256
   batch_size: 1
34 changes: 34 additions & 0 deletions configs/qwen2_5/fp8_static/qwen2_5-72b_instruct_fp8_static.yaml
@@ -0,0 +1,34 @@
# Global configuration of pipeline
global:
  save_path: ./output

# Simplified Configuration for LLM compression
model:
  name: Qwen
  model_path: Qwen/Qwen2.5-72B-Instruct
  trust_remote_code: true
  low_cpu_mem_usage: true
  use_cache: false
  torch_dtype: auto
  device_map: auto

# Compression configuration
compression:
  name: PTQ
  quantization:
    name: fp8_static  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
    bits: 8  # Quantization bits (4/8)
    quant_method:
      weight: "per-tensor"
      activation: "per-tensor"
    ignore_layers:  # Skip quantization for these layers
      - "lm_head"
      - "model.embed_tokens"

# Dataset for calibration
dataset:
  name: TextDataset
  data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
  max_seq_length: 4096
  num_samples: 256
  batch_size: 1
@@ -29,7 +29,7 @@ compression:
 # Dataset for calibration
 dataset:
   name: TextDataset
-  data_path: ./dataset/sharegpt/sharegpt_gpt4_512.json
+  data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
   max_seq_length: 4096
   num_samples: 256
   batch_size: 1