diff --git a/docs/ascend_tutorial/ascend_profiling_en.rst b/docs/ascend_tutorial/ascend_profiling_en.rst index bcd089e21dd..3121125cbd0 100644 --- a/docs/ascend_tutorial/ascend_profiling_en.rst +++ b/docs/ascend_tutorial/ascend_profiling_en.rst @@ -46,14 +46,10 @@ Use parameters in each role's ``profiler.tool_config.npu`` to control npu profil - level: Collection level—options are level_none, level0, level1, and level2 - - level_none: Disables all level-based data collection (turns off - profiler_level). - - level0: Collect high-level application data, underlying NPU data, - and operator execution details on NPU. - - level1: Extends level0 by adding CANN-layer AscendCL data and AI - Core performance metrics on NPU. - - level2: Extends level1 by adding CANN-layer Runtime data and AI - CPU metrics. + - level_none: Disables all level-based data collection (turns off profiler_level). + - level0: Collects high-level application data, underlying NPU data, and operator execution details on NPU. level0 is the recommended default because it balances data volume against analytical capability. + - level1: Extends level0 by adding CANN-layer AscendCL data and AI Core performance metrics on NPU. + - level2: Extends level1 by adding CANN-layer Runtime data and AI CPU metrics. - contents: A list of options to control the collection content, such as npu, cpu, memory, shapes, module, stack. @@ -62,8 +58,7 @@ Use parameters in each role's ``profiler.tool_config.npu`` to control npu profil - cpu: Whether to collect host-side performance data. - memory: Whether to enable memory analysis. - shapes: Whether to record tensor shapes. - - module: Whether to record framework-layer Python call stack - information. + - module: Whether to record framework-layer Python call stack information. Prefer 'module' over 'stack' for recording call stack information, as it incurs less performance overhead. - stack: Whether to record operator call stack information. 
- analysis: Enables automatic data parsing. diff --git a/docs/ascend_tutorial/ascend_profiling_zh.rst b/docs/ascend_tutorial/ascend_profiling_zh.rst index 00e8565a7e1..f85d12bbae8 100644 --- a/docs/ascend_tutorial/ascend_profiling_zh.rst +++ b/docs/ascend_tutorial/ascend_profiling_zh.rst @@ -41,7 +41,7 @@ Last updated: 08/14/2025. - level: 采集级别—选项有 level_none、level0、level1 和 level2 - level_none: 禁用所有基于级别的数据采集(关闭 profiler_level)。 - - level0: 采集高级应用数据、底层NPU数据和NPU上的算子执行详情。 + - level0: 采集高级应用数据、底层NPU数据和NPU上的算子执行详情。在权衡数据量和分析能力后,level0是推荐的默认配置。 - level1: 在level0基础上增加CANN层AscendCL数据和NPU上的AI Core性能指标。 - level2: 在level1基础上增加CANN层Runtime数据和AI CPU指标。 @@ -52,7 +52,7 @@ Last updated: 08/14/2025. - cpu: 是否采集主机端性能数据。 - memory: 是否启用内存分析。 - shapes: 是否记录张量形状。 - - module: 是否记录框架层Python调用栈信息。 + - module: 是否记录框架层Python调用栈信息。相较于stack,更推荐使用module记录调用栈信息,因其产生的性能膨胀更低。 - stack: 是否记录算子调用栈信息。 - analysis: 启用自动数据解析。 diff --git a/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh b/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh index 020915c47ce..3a2d523f26d 100644 --- a/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh +++ b/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh @@ -8,7 +8,7 @@ PROFILE_RANKS="[1,2]" # profiling NPU options SAVE_PATH="$HOME/profile_data" -LEVEL="level1" +LEVEL="level0" CONTENTS=['npu','cpu'] ANALYSIS=True diff --git a/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh b/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh index b7c806762a7..963e75a6343 100644 --- a/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh +++ b/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh @@ -7,7 +7,7 @@ DISCRETE=False # profiling NPU options SAVE_PATH="$HOME/profile_data" -LEVEL="level1" +LEVEL="level0" CONTENTS=['npu','cpu'] ANALYSIS=True diff --git a/recipe/gkd/config/on_policy_distill_trainer.yaml b/recipe/gkd/config/on_policy_distill_trainer.yaml index 120c3b7a38b..5227785b1e4 100644 --- 
a/recipe/gkd/config/on_policy_distill_trainer.yaml +++ b/recipe/gkd/config/on_policy_distill_trainer.yaml @@ -158,7 +158,7 @@ actor_rollout_ref: contents: [] # Collection level, optional values: level_none, level0, level1, level2. - level: "level1" + level: "level0" # Whether to automatically parse the data. analysis: True diff --git a/tests/trainer/config/legacy_ppo_megatron_trainer.yaml b/tests/trainer/config/legacy_ppo_megatron_trainer.yaml index 06e2e94a662..3dd0b8a38d6 100644 --- a/tests/trainer/config/legacy_ppo_megatron_trainer.yaml +++ b/tests/trainer/config/legacy_ppo_megatron_trainer.yaml @@ -456,7 +456,7 @@ trainer: options: save_path: ./profiler_data roles: ["all"] - level: level1 + level: level0 with_memory: False record_shapes: False with_npu: True diff --git a/tests/trainer/config/legacy_ppo_trainer.yaml b/tests/trainer/config/legacy_ppo_trainer.yaml index c09e06e978d..25919bd15d9 100644 --- a/tests/trainer/config/legacy_ppo_trainer.yaml +++ b/tests/trainer/config/legacy_ppo_trainer.yaml @@ -1015,7 +1015,7 @@ trainer: roles: ["all"] # Collection level, optional values: level_none, level0, level1, level2. - level: level1 + level: level0 # Whether to enable memory analysis. 
with_memory: False diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml index a117c0f332f..d6260ed6b19 100644 --- a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml @@ -112,7 +112,7 @@ actor_rollout_ref: npu: _target_: verl.utils.profiler.config.NPUToolConfig contents: [] - level: level1 + level: level0 analysis: true discrete: false torch: @@ -151,7 +151,7 @@ actor_rollout_ref: npu: _target_: verl.utils.profiler.config.NPUToolConfig contents: [] - level: level1 + level: level0 analysis: true discrete: false torch: @@ -505,7 +505,7 @@ critic: npu: _target_: verl.utils.profiler.config.NPUToolConfig contents: [] - level: level1 + level: level0 analysis: true discrete: false torch: diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml index 833ebb70d5b..da79bfb943a 100644 --- a/verl/trainer/config/_generated_ppo_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_trainer.yaml @@ -99,7 +99,7 @@ actor_rollout_ref: npu: _target_: verl.utils.profiler.config.NPUToolConfig contents: [] - level: level1 + level: level0 analysis: true discrete: false torch: @@ -142,7 +142,7 @@ actor_rollout_ref: npu: _target_: verl.utils.profiler.config.NPUToolConfig contents: [] - level: level1 + level: level0 analysis: true discrete: false torch: @@ -439,7 +439,7 @@ critic: npu: _target_: verl.utils.profiler.config.NPUToolConfig contents: [] - level: level1 + level: level0 analysis: true discrete: false torch: diff --git a/verl/trainer/config/actor/actor.yaml b/verl/trainer/config/actor/actor.yaml index 283095a1527..1c8dae7c5dd 100644 --- a/verl/trainer/config/actor/actor.yaml +++ b/verl/trainer/config/actor/actor.yaml @@ -190,7 +190,7 @@ profiler: contents: [] # Collection level, optional values: level_none, level0, level1, level2. 
- level: "level1" + level: "level0" # Whether to automatically parse the data. analysis: True diff --git a/verl/trainer/config/critic/critic.yaml b/verl/trainer/config/critic/critic.yaml index 95cbeaf92bc..afa78576444 100644 --- a/verl/trainer/config/critic/critic.yaml +++ b/verl/trainer/config/critic/critic.yaml @@ -143,7 +143,7 @@ profiler: contents: [] # Collection level, optional values: level_none, level0, level1, level2. - level: "level1" + level: "level0" # Whether to automatically parse the data. analysis: True diff --git a/verl/trainer/config/npu_profile/npu_profile.yaml b/verl/trainer/config/npu_profile/npu_profile.yaml index 52bb52d3f40..bb34dc7cf59 100644 --- a/verl/trainer/config/npu_profile/npu_profile.yaml +++ b/verl/trainer/config/npu_profile/npu_profile.yaml @@ -10,7 +10,7 @@ options: roles: ["all"] # Collection level, optional values: level_none, level0, level1, level2. - level: level1 + level: level0 # Whether to enable memory analysis. with_memory: False diff --git a/verl/trainer/config/ref/ref.yaml b/verl/trainer/config/ref/ref.yaml index ec566c25b9a..eef41625cba 100644 --- a/verl/trainer/config/ref/ref.yaml +++ b/verl/trainer/config/ref/ref.yaml @@ -67,7 +67,7 @@ profiler: contents: [] # Collection level, optional values: level_none, level0, level1, level2. - level: "level1" + level: "level0" # Whether to automatically parse the data. analysis: True diff --git a/verl/utils/profiler/config.py b/verl/utils/profiler/config.py index 33bd3f24251..e1ade336972 100644 --- a/verl/utils/profiler/config.py +++ b/verl/utils/profiler/config.py @@ -82,7 +82,7 @@ class NPUToolConfig(NsightToolConfig): contents: list[str] = field(default_factory=list) # Collection level, optional values: level_none, level0, level1, level2. - level: str = "level1" + level: str = "level0" # Whether to automatically parse the data. 
analysis: bool = False diff --git a/verl/utils/profiler/mstx_profile.py b/verl/utils/profiler/mstx_profile.py index b9576714248..4868ef2af36 100644 --- a/verl/utils/profiler/mstx_profile.py +++ b/verl/utils/profiler/mstx_profile.py @@ -20,6 +20,7 @@ from typing import Any, Callable, Optional import torch_npu +from packaging import version from torch_npu.npu import mstx from .config import NPUToolConfig @@ -128,12 +129,16 @@ def get_npu_profiler( if role: profile_save_path = os.path.join(profile_save_path, role) + # The ability to filter communication via mstx_domain_exclude requires torch_npu==2.1 or higher. + if version.parse(torch_npu.__version__) < version.parse("2.1"): + raise RuntimeError("torch_npu==2.1 or higher is required to use mstx_domain_exclude") + experimental_config = torch_npu.profiler._ExperimentalConfig( - aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization, profiler_level=level, - export_type=torch_npu.profiler.ExportType.Text, + export_type=torch_npu.profiler.ExportType.Db, data_simplification=True, msprof_tx=True, + mstx_domain_exclude=["communication"], ) activites = []