verl-project · FightingZhen · Dec 19, 2025 · Dec 10, 2025 · Dec 10, 2025 · Dec 12, 2025
@@ -46,14 +46,10 @@ Use parameters in each role's ``profiler.tool_config.npu`` to control npu profil
 -  level: Collection level—options are level_none, level0, level1, and
    level2
 
-   -  level_none: Disables all level-based data collection (turns off
-      profiler_level).
-   -  level0: Collect high-level application data, underlying NPU data,
-      and operator execution details on NPU.
-   -  level1: Extends level0 by adding CANN-layer AscendCL data and AI
-      Core performance metrics on NPU.
-   -  level2: Extends level1 by adding CANN-layer Runtime data and AI
-      CPU metrics.
+   -  level_none: Disables all level-based data collection (turns off profiler_level).
+   -  level0: Collects high-level application data, underlying NPU data, and operator execution details on NPU. It offers the best balance between data volume and analytical capability, so level0 is recommended as the default configuration.
+   -  level1: Extends level0 by adding CANN-layer AscendCL data and AI Core performance metrics on NPU.
+   -  level2: Extends level1 by adding CANN-layer Runtime data and AI CPU metrics.
 
 -  contents: A list of options to control the collection content, such as
    npu, cpu, memory, shapes, module, stack.
@@ -62,8 +58,7 @@ Use parameters in each role's ``profiler.tool_config.npu`` to control npu profil
    -  cpu: Whether to collect host-side performance data.
    -  memory: Whether to enable memory analysis.
    -  shapes: Whether to record tensor shapes.
-   -  module: Whether to record framework-layer Python call stack
-      information.
+   -  module: Whether to record framework-layer Python call stack information. Prefer 'module' over 'stack' for recording call stack information, as it incurs less performance overhead.
    -  stack: Whether to record operator call stack information.
 
 -  analysis: Enables automatic data parsing.

@@ -41,7 +41,7 @@ Last updated: 08/14/2025.
 -  level: 采集级别—选项有 level_none、level0、level1 和 level2
 
    -  level_none: 禁用所有基于级别的数据采集（关闭 profiler_level）。
-   -  level0: 采集高级应用数据、底层NPU数据和NPU上的算子执行详情。
+   -  level0: 采集高级应用数据、底层NPU数据和NPU上的算子执行详情。在权衡数据量和分析能力后，level0是推荐的默认配置。
    -  level1: 在level0基础上增加CANN层AscendCL数据和NPU上的AI Core性能指标。
    -  level2: 在level1基础上增加CANN层Runtime数据和AI CPU指标。
 
@@ -52,7 +52,7 @@ Last updated: 08/14/2025.
    -  cpu: 是否采集主机端性能数据。
    -  memory: 是否启用内存分析。
    -  shapes: 是否记录张量形状。
-   -  module: 是否记录框架层Python调用栈信息。
+   -  module: 是否记录框架层Python调用栈信息。相较于stack，更推荐使用module记录调用栈信息，因其产生的性能膨胀更低。
    -  stack: 是否记录算子调用栈信息。
 
 -  analysis: 启用自动数据解析。

@@ -8,7 +8,7 @@ PROFILE_RANKS="[1,2]"
 
 # profiling NPU options
 SAVE_PATH="$HOME/profile_data"
-LEVEL="level1"
+LEVEL="level0"
 CONTENTS=['npu','cpu']
 ANALYSIS=True
 

@@ -7,7 +7,7 @@ DISCRETE=False
 
 # profiling NPU options
 SAVE_PATH="$HOME/profile_data"
-LEVEL="level1"
+LEVEL="level0"
 CONTENTS=['npu','cpu']
 ANALYSIS=True
 

diff --git a/recipe/gkd/config/on_policy_distill_trainer.yaml b/recipe/gkd/config/on_policy_distill_trainer.yaml
@@ -158,7 +158,7 @@ actor_rollout_ref:
           contents: []
 
           # Collection level, optional values: level_none, level0, level1, level2.
-          level: "level1"
+          level: "level0"
 
           # Whether to automatically parse the data.
           analysis: True

@@ -456,7 +456,7 @@ trainer:
     options:
       save_path: ./profiler_data
       roles: ["all"]
-      level: level1
+      level: level0
       with_memory: False
       record_shapes: False
       with_npu: True

@@ -1015,7 +1015,7 @@ trainer:
       roles: ["all"]
 
       # Collection level, optional values: level_none, level0, level1, level2.
-      level: level1
+      level: level0
 
       # Whether to enable memory analysis.
       with_memory: False

@@ -112,7 +112,7 @@ actor_rollout_ref:
         npu:
           _target_: verl.utils.profiler.config.NPUToolConfig
           contents: []
-          level: level1
+          level: level0
           analysis: true
           discrete: false
         torch:
@@ -151,7 +151,7 @@ actor_rollout_ref:
         npu:
           _target_: verl.utils.profiler.config.NPUToolConfig
           contents: []
-          level: level1
+          level: level0
           analysis: true
           discrete: false
         torch:
@@ -505,7 +505,7 @@ critic:
       npu:
         _target_: verl.utils.profiler.config.NPUToolConfig
         contents: []
-        level: level1
+        level: level0
         analysis: true
         discrete: false
       torch:

@@ -99,7 +99,7 @@ actor_rollout_ref:
         npu:
           _target_: verl.utils.profiler.config.NPUToolConfig
           contents: []
-          level: level1
+          level: level0
           analysis: true
           discrete: false
         torch:
@@ -142,7 +142,7 @@ actor_rollout_ref:
         npu:
           _target_: verl.utils.profiler.config.NPUToolConfig
           contents: []
-          level: level1
+          level: level0
           analysis: true
           discrete: false
         torch:
@@ -439,7 +439,7 @@ critic:
       npu:
         _target_: verl.utils.profiler.config.NPUToolConfig
         contents: []
-        level: level1
+        level: level0
         analysis: true
         discrete: false
       torch:

@@ -190,7 +190,7 @@ profiler:
       contents: []
 
       # Collection level, optional values: level_none, level0, level1, level2.
-      level: "level1"
+      level: "level0"
 
       # Whether to automatically parse the data.
       analysis: True

@@ -143,7 +143,7 @@ profiler:
       contents: []
 
       # Collection level, optional values: level_none, level0, level1, level2.
-      level: "level1"
+      level: "level0"
 
       # Whether to automatically parse the data.
       analysis: True

@@ -10,7 +10,7 @@ options:
   roles: ["all"]
 
   # Collection level, optional values: level_none, level0, level1, level2.
-  level: level1
+  level: level0
 
   # Whether to enable memory analysis.
   with_memory: False

@@ -67,7 +67,7 @@ profiler:
       contents: []
 
       # Collection level, optional values: level_none, level0, level1, level2.
-      level: "level1"
+      level: "level0"
 
       # Whether to automatically parse the data.
       analysis: True

diff --git a/verl/utils/profiler/config.py b/verl/utils/profiler/config.py
@@ -82,7 +82,7 @@ class NPUToolConfig(NsightToolConfig):
     contents: list[str] = field(default_factory=list)
 
     # Collection level, optional values: level_none, level0, level1, level2.
-    level: str = "level1"
+    level: str = "level0"
 
     # Whether to automatically parse the data.
     analysis: bool = False

diff --git a/verl/utils/profiler/mstx_profile.py b/verl/utils/profiler/mstx_profile.py
@@ -20,6 +20,7 @@
 from typing import Any, Callable, Optional
 
 import torch_npu
+from packaging import version
 from torch_npu.npu import mstx
 
 from .config import NPUToolConfig
@@ -128,12 +129,16 @@ def get_npu_profiler(
     if role:
         profile_save_path = os.path.join(profile_save_path, role)
 
+    # Excluding the "communication" domain via mstx_domain_exclude requires torch_npu 2.1 or newer.
+    if version.parse(torch_npu.__version__) < version.parse("2.1"):
+        raise RuntimeError("torch_npu==2.1 or higher is required to use mstx_domain_exclude")
+
     experimental_config = torch_npu.profiler._ExperimentalConfig(
-        aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization,
         profiler_level=level,
-        export_type=torch_npu.profiler.ExportType.Text,
+        export_type=torch_npu.profiler.ExportType.Db,
         data_simplification=True,
         msprof_tx=True,
+        mstx_domain_exclude=["communication"],
     )
 
     activites = []