27 changes: 27 additions & 0 deletions configs/qwen2_5/fp8_dynamic/qwen2_5-0_5b_instruct_fp8_dynamic.yaml
@@ -0,0 +1,27 @@
# Global configuration of pipeline
global:
  save_path: ./output

# Simplified Configuration for LLM compression
model:
  name: Qwen
  model_path: Qwen/Qwen2.5-0.5B-Instruct
  trust_remote_code: true
  low_cpu_mem_usage: true
  use_cache: false
  torch_dtype: auto
  device_map: auto

# Compression configuration
compression:
  name: PTQ
  quantization:
    name: fp8_dynamic  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
    bits: 8  # Quantization bits (4/8)
    quant_method:
      weight: "per-tensor"
      activation: "per-tensor"
    ignore_layers:  # Skip quantization for these layers
      - "lm_head"
      - "model.embed_tokens"

26 changes: 26 additions & 0 deletions configs/qwen2_5/fp8_dynamic/qwen2_5-3b_instruct_fp8_dynamic.yaml
@@ -0,0 +1,26 @@
# Global configuration of pipeline
global:
  save_path: ./output

# Simplified Configuration for LLM compression
model:
  name: Qwen
  model_path: Qwen/Qwen2.5-3B-Instruct
  trust_remote_code: true
  low_cpu_mem_usage: true
  use_cache: false
  torch_dtype: auto
  device_map: auto

# Compression configuration
compression:
  name: PTQ
  quantization:
    name: fp8_dynamic  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
    bits: 8  # Quantization bits (4/8)
    quant_method:
      weight: "per-tensor"
      activation: "per-tensor"
    ignore_layers:  # Skip quantization for these layers
      - "lm_head"
      - "model.embed_tokens"
26 changes: 26 additions & 0 deletions configs/qwen2_5/fp8_dynamic/qwen2_5-72b_instruct_fp8_dynamic.yaml
@@ -0,0 +1,26 @@
# Global configuration of pipeline
global:
  save_path: ./output

# Simplified Configuration for LLM compression
model:
  name: Qwen
  model_path: Qwen/Qwen2.5-72B-Instruct
  trust_remote_code: true
  low_cpu_mem_usage: true
  use_cache: false
  torch_dtype: auto
  device_map: auto

# Compression configuration
compression:
  name: PTQ
  quantization:
    name: fp8_dynamic  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
    bits: 8  # Quantization bits (4/8)
    quant_method:
      weight: "per-tensor"
      activation: "per-tensor"
    ignore_layers:  # Skip quantization for these layers
      - "lm_head"
      - "model.embed_tokens"
34 changes: 34 additions & 0 deletions configs/qwen2_5/fp8_static/qwen2_5-0_5b_instruct_fp8_static.yaml
@@ -0,0 +1,34 @@
# Global configuration of pipeline
global:
  save_path: ./output

# Simplified Configuration for LLM compression
model:
  name: Qwen
  model_path: Qwen/Qwen2.5-0.5B-Instruct
  trust_remote_code: true
  low_cpu_mem_usage: true
  use_cache: false
  torch_dtype: auto
  device_map: auto

# Compression configuration
compression:
  name: PTQ
  quantization:
    name: fp8_static  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
    bits: 8  # Quantization bits (4/8)
    quant_method:
      weight: "per-tensor"
      activation: "per-tensor"
    ignore_layers:  # Skip quantization for these layers
      - "lm_head"
      - "model.embed_tokens"

# Dataset for calibration
dataset:
  name: TextDataset
  data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
  max_seq_length: 4096
  num_samples: 256
  batch_size: 1
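
Note: unlike fp8_dynamic, fp8_static freezes activation scales offline, which is why a calibration dataset block appears in these configs: num_samples: 256 sequences of up to max_seq_length: 4096 tokens are run through the model to observe activation ranges. A minimal sketch of the usual max-abs calibration idea follows (hypothetical names, not this repository's API):

# Hypothetical illustration of static scale calibration.
import torch

FP8_E4M3_MAX = 448.0  # largest finite float8_e4m3fn value

class MaxAbsObserver:
    """Tracks the running max |activation| seen during calibration."""
    def __init__(self):
        self.amax = 0.0

    def update(self, x: torch.Tensor):
        self.amax = max(self.amax, x.detach().abs().max().item())

    def scale(self) -> float:
        # Frozen after calibration and reused for every inference batch.
        return max(self.amax, 1e-12) / FP8_E4M3_MAX

# Typical use: hook one observer onto each quantized activation, run the
# 256 calibration samples, then persist observer.scale() with the weights.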
@@ -5,7 +5,7 @@ global:
 # Simplified Configuration for LLM compression
 model:
   name: Qwen
-  model_path: Qwen/Qwen2.5-7B-Instruct
+  model_path: Qwen/Qwen2.5-3B-Instruct
   trust_remote_code: true
   low_cpu_mem_usage: true
   use_cache: false
@@ -28,7 +28,7 @@ compression:
 # Dataset for calibration
 dataset:
   name: TextDataset
-  data_path: ./dataset/sharegpt/sharegpt_gpt4_512.json
+  data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
   max_seq_length: 4096
   num_samples: 256
   batch_size: 1
34 changes: 34 additions & 0 deletions configs/qwen2_5/fp8_static/qwen2_5-72b_instruct_fp8_static.yaml
@@ -0,0 +1,34 @@
# Global configuration of pipeline
global:
  save_path: ./output

# Simplified Configuration for LLM compression
model:
  name: Qwen
  model_path: Qwen/Qwen2.5-72B-Instruct
  trust_remote_code: true
  low_cpu_mem_usage: true
  use_cache: false
  torch_dtype: auto
  device_map: auto

# Compression configuration
compression:
  name: PTQ
  quantization:
    name: fp8_static  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
    bits: 8  # Quantization bits (4/8)
    quant_method:
      weight: "per-tensor"
      activation: "per-tensor"
    ignore_layers:  # Skip quantization for these layers
      - "lm_head"
      - "model.embed_tokens"

# Dataset for calibration
dataset:
  name: TextDataset
  data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
  max_seq_length: 4096
  num_samples: 256
  batch_size: 1
@@ -29,7 +29,7 @@ compression:
 # Dataset for calibration
 dataset:
   name: TextDataset
-  data_path: ./dataset/sharegpt/sharegpt_gpt4_512.json
+  data_path: ./dataset/sharegpt_gpt4_qwen/sharegpt_gpt4-qwen3_a22B_output.jsonl
   max_seq_length: 4096
   num_samples: 256
   batch_size: 1