add new metric and fix repeated png

zhulinJulia24 · zhulinJulia24 · commit 9f4d171a6e78 · 2026-06-11T16:41:14.000+08:00
diff --git a/autotest/config-npu.yaml b/autotest/config-npu.yaml
@@ -56,6 +56,7 @@ case:
     npu-qwen3-sft-ep8:
         -
             type: sft
+            phase: first
             parameters:
                 config: autotest/config/npu_qwen3_moe_30BA3_ep8.py
                 output_path: /mnt/hwfile/llmrazor/qa-llm-cicd/test_output
@@ -80,6 +81,7 @@ case:
             timeout: 10800
         -
             type: sft
+            phase: resume
             pre_action:
                 command: 'python ./autotest/utils/update_meta.py /mnt/hwfile/llmrazor/qa-llm-cicd/test_output npu-qwen3-sft-ep8 sft'
             parameters:
diff --git a/autotest/config.yaml b/autotest/config.yaml
@@ -54,6 +54,7 @@ case:
     qwen3-sft-ep8:
         -
             type: sft
+            phase: first
             parameters:
                 config: autotest/config/qwen3_moe_30BA3_ep8.py
                 output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
@@ -79,6 +80,7 @@ case:
             timeout: 1500
         -
             type: sft
+            phase: resume
             pre_action:
                 command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-sft-ep8 sft'
             parameters:
@@ -475,6 +477,7 @@ case:
     qwen3-5-sft-sp4-resume:
         -
             type: sft
+            phase: first
             parameters:
                 config: autotest/config/qwen3_5_moe_30BA3_sp4.py
                 output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
@@ -499,6 +502,7 @@ case:
 
         -
             type: sft
+            phase: resume
             pre_action:
                 command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume sft'
             parameters:
@@ -608,6 +612,7 @@ case:
     qwen3-5-sft-sp4-resume-vl:
         -
             type: sft
+            phase: first
             parameters:
                 config: autotest/config/qwen3_5_moe_30BA3_sp4_vl.py
                 output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
@@ -634,6 +639,7 @@ case:
 
         -
             type: sft
+            phase: resume
             pre_action:
                 command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume-vl sft'
             parameters:
@@ -891,6 +897,7 @@ case:
     qwen3-5-rl-vl-lmdeploy-resume:
         -
             type: rl
+            phase: first
             parameters:
                 config: autotest/config/rl_qwen3p5_vl_35B_dapo_ep2_resume.py
                 infer_backend: lmdeploy
@@ -935,6 +942,7 @@ case:
 
         -
             type: rl
+            phase: resume
             pre_action:
                 command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-rl-vl-lmdeploy-resume rl'
             parameters:
diff --git a/autotest/module/train.py b/autotest/module/train.py
@@ -1,16 +1,22 @@
+import json
 import os
+import shutil
+from typing import Any
 
 from utils.check_metric import check_result, check_rl_result
 from utils.run_cmd import run_cmd
 
 
+# File name of the saved first-run tracker copy; it is joined directly onto a
+# case's work_dir to form the snapshot path.
+FIRST_RUN_TRACKER_SNAPSHOT = "_first_run_tracker.jsonl"
+
+
 class Train:
     def get_cmd(config):
         print(config)
         config_path = config.get("parameters").get("config")
         train_type = config.get("type")
         nproc_per_node = config.get("resource", {}).get("gpus_per_task", 8)
-        pip_package = config.get("resource", {}).get("pip_package", 'ls')
+        pip_package = config.get("resource", {}).get("pip_package", "ls")
         if train_type in ["sft", "rl"]:
             model_config = config.get("parameters", {}).get("model", None)
             config_path = config.get("parameters", {}).get("config", None)
@@ -70,22 +76,28 @@ def get_cmd(config):
 
     def validate(config):
         work_dir = config.get("work_dir", None)
-        base_path = os.path.join(
-            config.get("base_path").get("base_baseline_path"), config.get("assert_info", {}).get("base_metric", None)
-        )
+        base_metric = config.get("assert_info", {}).get("base_metric", None)
+        base_path = os.path.join(config.get("base_path").get("base_baseline_path"), base_metric)
         train_type = config.get("type")
+        case_name = config["case_name"]
+        phase = config.get("phase")
+        context = config.get("context", {})
+
+        cur_path = resolve_tracker_path(work_dir, train_type, phase, context=context)
+
         if train_type == "sft":
-            cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/rank0/tracker.jsonl")
             check_metrics = config.get("assert_info", {}).get("check_metrics", {})
-            return check_result(config["case_name"], base_path, cur_path, check_metrics)
+            result = check_result(case_name, base_path, cur_path, check_metrics, phase=phase)
         elif train_type == "rl":
-            cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/tracker.jsonl")
             check_metrics = config.get("assert_info", {})
-            return check_rl_result(config["case_name"], base_path, cur_path, check_metrics)
+            result = check_rl_result(case_name, base_path, cur_path, check_metrics, phase=phase)
         else:
             print("Unknown type: {train_type}")
             return False
 
+        snapshot_first_run_tracker(work_dir, phase, cur_path, context=context)
+        return result
+
     def pre_action(config=None):
         action_info = config.get("pre_action", None)
         if action_info:
@@ -101,12 +113,92 @@ def post_action(config=None):
                 run_cmd(action_cmd)
 
 
-def get_latest_subdir(work_dir):
-    dirs = [
-        d for d in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, d)) and len(d) == 14 and d.isdigit()
-    ]
+def list_timestamp_subdirs(work_dir: str) -> list[str]:
+    """Return the timestamp-like sub-directory names of ``work_dir``, sorted ascending.
+
+    An entry qualifies when it is a directory whose name is exactly 14
+    characters long and all digits (presumably ``YYYYMMDDHHMMSS`` run folders —
+    confirm). Bare names are returned, not full paths.
+    """
+    return sorted(
+        name
+        for name in os.listdir(work_dir)
+        if os.path.isdir(os.path.join(work_dir, name)) and len(name) == 14 and name.isdigit()
+    )
+
+
+def _tracker_relpath(train_type: str) -> str:
+    """Return the tracker file location relative to an experiment directory.
+
+    ``"sft"`` uses the ``rank0`` sub-folder; every other ``train_type`` gets
+    the tracker directly under ``logs/exp_tracking``.
+    """
+    if train_type == "sft":
+        return "logs/exp_tracking/rank0/tracker.jsonl"
+    return "logs/exp_tracking/tracker.jsonl"
+
+
+def _tracker_path(exp_dir: str | None, train_type: str) -> str:
+    return os.path.join(exp_dir, _tracker_relpath(train_type))
+
+
+def _snapshot_path(work_dir: str) -> str:
+    """Return the path of the first-run tracker snapshot inside ``work_dir``."""
+    return os.path.join(work_dir, FIRST_RUN_TRACKER_SNAPSHOT)
+
+
+def _write_first_run_segment(src: str, dst: str) -> None:
+    """Copy the leading records of ``src`` to ``dst``, stopping at the first repeated step.
+
+    Each non-blank line is parsed as JSON and its ``"step"`` value is read.
+    Copying stops, without writing that line, as soon as a step value that was
+    already seen shows up again (presumably the point where a resumed run
+    started appending — confirm). Blank lines are skipped, and every written
+    line is newline-terminated. The parent directory of ``dst`` is created when
+    missing, and ``dst`` is opened for writing, so previous contents are replaced.
+    JSON parse errors are not caught here.
+    """
+    os.makedirs(os.path.dirname(dst), exist_ok=True)
+    seen_steps: set[Any] = set()
+    with open(src, encoding="utf-8") as fin, open(dst, "w", encoding="utf-8") as fout:
+        for line in fin:
+            if not line.strip():
+                continue
+            # A record without a "step" key yields None, which is tracked like any other value.
+            step = json.loads(line).get("step")
+            if step in seen_steps:
+                break
+            seen_steps.add(step)
+            fout.write(line if line.endswith("\n") else f"{line}\n")
+
+
+def _has_duplicate_steps(tracker_path: str) -> bool:
+    """Return ``True`` when any ``"step"`` value occurs more than once in ``tracker_path``.
+
+    Every non-blank line is parsed as JSON. Records without a ``"step"`` key
+    contribute ``None``, so two such records also count as a duplicate.
+    """
+    steps: list[Any] = []
+    with open(tracker_path, encoding="utf-8") as f:
+        for line in f:
+            if line.strip():
+                steps.append(json.loads(line).get("step"))
+    return len(steps) != len(set(steps))
+
+
+def resolve_tracker_path(
+    work_dir: str,
+    train_type: str,
+    phase: str | None,
+    context: dict[str, Any] | None = None,
+) -> str:
+    context = context or {}
+    snapshot = context.get("first_run_tracker") or _snapshot_path(work_dir)
+
+    if phase == "first":
+        if os.path.isfile(snapshot):
+            return snapshot
+
+        subdirs = list_timestamp_subdirs(work_dir)
+        if len(subdirs) > 1:
+            exp_dir = os.path.join(work_dir, subdirs[0])
+        else:
+            exp_dir = os.path.join(work_dir, subdirs[-1]) if subdirs else None
+        live_tracker = _tracker_path(exp_dir, train_type)
+
+        if os.path.isfile(live_tracker) and _has_duplicate_steps(live_tracker):
+            _write_first_run_segment(live_tracker, snapshot)
+            if os.path.isfile(snapshot) and os.path.getsize(snapshot) > 0:
+                return snapshot
+        return live_tracker
+
+    subdirs = list_timestamp_subdirs(work_dir)
+    exp_dir = os.path.join(work_dir, subdirs[-1]) if subdirs else None
+    return _tracker_path(exp_dir, train_type)
+
 
-    if not dirs:
-        return None
-    latest = max(dirs, key=lambda d: os.path.getmtime(os.path.join(work_dir, d)))
-    return os.path.join(work_dir, latest)
+def snapshot_first_run_tracker(
+    work_dir: str,
+    phase: str | None,
+    cur_path: str,
+    context: dict[str, Any] | None = None,
+) -> None:
+    """Save the first-run tracker under ``work_dir`` and record its path in ``context``.
+
+    Does nothing unless ``phase == "first"`` and ``cur_path`` is an existing
+    file. Otherwise ``cur_path`` is copied to the snapshot path (skipped when it
+    already is that path), and ``context["first_run_tracker"]`` is set when a
+    ``context`` dict is supplied.
+    """
+    if phase != "first" or not os.path.isfile(cur_path):
+        return
+    snapshot = _snapshot_path(work_dir)
+    if cur_path != snapshot:
+        shutil.copy2(cur_path, snapshot)
+    if context is not None:
+        context["first_run_tracker"] = snapshot
diff --git a/autotest/utils/check_metric.py b/autotest/utils/check_metric.py
@@ -10,6 +10,13 @@
 )
 logger = logging.getLogger(__name__)
 
+# Tuning knobs for the upward-memory-gradient check.
+# Number of leading samples dropped before the series is analysed.
+MEMORY_GRADIENT_WARMUP_STEPS = 5
+# Minimum number of samples a segment needs to be analysed.
+MEMORY_GRADIENT_MIN_SEGMENT_LEN = 8
+# Minimum fraction of step-to-step deltas that must be positive.
+MEMORY_GRADIENT_POSITIVE_RATIO = 0.65
+# The fitted linear slope per step must exceed this value.
+MEMORY_GRADIENT_MIN_SLOPE_GB = 1e-4
+# slope * (segment length - 1) / segment mean must exceed this value.
+MEMORY_GRADIENT_MIN_REL_DRIFT = 0.00015
+# A drop of at least this much between consecutive samples starts a new segment.
+MEMORY_GRADIENT_RESUME_DROP_GB = 0.005
+
 
 def extract_value(file, metrics):
     metric_all = {metric: [] for metric in metrics}
@@ -25,7 +32,57 @@ def extract_value(file, metrics):
     return total_step, metric_all
 
 
-def check_result(case_name, base_path, cur_path, check_metric):
+def _split_memory_segments(values: np.ndarray) -> list[np.ndarray]:
+    """Split ``values`` at every large drop between consecutive samples.
+
+    A new segment starts wherever a sample is lower than its predecessor by at
+    least ``MEMORY_GRADIENT_RESUME_DROP_GB``. Pieces shorter than
+    ``MEMORY_GRADIENT_MIN_SEGMENT_LEN`` are discarded. When the input itself is
+    shorter than that minimum, or no piece is long enough, the whole input is
+    returned as a single segment.
+    """
+    if len(values) < MEMORY_GRADIENT_MIN_SEGMENT_LEN:
+        return [values]
+
+    segments: list[np.ndarray] = []
+    start = 0
+    for idx in range(1, len(values)):
+        dropped = values[idx - 1] - values[idx]
+        if dropped >= MEMORY_GRADIENT_RESUME_DROP_GB:
+            if idx - start >= MEMORY_GRADIENT_MIN_SEGMENT_LEN:
+                segments.append(values[start:idx])
+            # The next segment begins at the sample after the drop, even if the
+            # piece before it was too short to keep.
+            start = idx
+    if len(values) - start >= MEMORY_GRADIENT_MIN_SEGMENT_LEN:
+        segments.append(values[start:])
+    return segments or [values]
+
+
+def detect_memory_upward_gradient(values: list[float]) -> tuple[bool, str]:
+    """Detect sustained upward memory drift (possible leak) in the current run.
+
+    The first ``MEMORY_GRADIENT_WARMUP_STEPS`` samples are dropped and the rest
+    is split with ``_split_memory_segments``. A segment is flagged when all
+    three hold: the slope of a fitted straight line exceeds
+    ``MEMORY_GRADIENT_MIN_SLOPE_GB``, the share of step-to-step increases
+    larger than ``1e-4`` reaches ``MEMORY_GRADIENT_POSITIVE_RATIO``, and
+    ``slope * (len - 1) / mean`` exceeds ``MEMORY_GRADIENT_MIN_REL_DRIFT``.
+
+    Returns:
+        ``(True, description)`` for the first flagged segment, otherwise
+        ``(False, "")``. Inputs with at most warmup + minimum-segment samples
+        are never flagged.
+    """
+    if len(values) <= MEMORY_GRADIENT_WARMUP_STEPS + MEMORY_GRADIENT_MIN_SEGMENT_LEN:
+        return False, ""
+
+    series = np.asarray(values[MEMORY_GRADIENT_WARMUP_STEPS:], dtype=float)
+
+    for seg_idx, segment in enumerate(_split_memory_segments(series)):
+        # The splitter can hand back the whole series as a fallback; re-check its length.
+        if len(segment) < MEMORY_GRADIENT_MIN_SEGMENT_LEN:
+            continue
+
+        deltas = np.diff(segment)
+        positive_ratio = float(np.mean(deltas > 1e-4))
+        x = np.arange(len(segment))
+        # Degree-1 fit: the first coefficient is the slope per step.
+        slope, _ = np.polyfit(x, segment, 1)
+        mean_val = float(np.mean(segment))
+        # Skip near-zero means to avoid dividing by (almost) zero below.
+        if mean_val < 1e-10:
+            continue
+
+        relative_drift = float(slope * (len(segment) - 1) / mean_val)
+        slope_rising = slope > MEMORY_GRADIENT_MIN_SLOPE_GB
+        mostly_increasing = positive_ratio >= MEMORY_GRADIENT_POSITIVE_RATIO
+        drift_too_large = relative_drift > MEMORY_GRADIENT_MIN_REL_DRIFT
+
+        if slope_rising and mostly_increasing and drift_too_large:
+            return True, (
+                f"segment {seg_idx}: slope={slope:.6f} GB/step, "
+                f"relative_drift={relative_drift:.4f}, positive_ratio={positive_ratio:.2f}"
+            )
+    return False, ""
+
+
+def check_result(case_name, base_path, cur_path, check_metric, phase=None):
     fail_metric = {}
     metric_list = list(check_metric.keys())
     base_steps, base_metrics = extract_value(base_path, metric_list)
@@ -34,28 +91,57 @@ def check_result(case_name, base_path, cur_path, check_metric):
         f"current steps is not equal to base steps, current steps: {cur_steps}, base steps: {base_steps}"
     )
 
-    publish_comparison_report(case_name, check_metric, base_metrics, cur_metrics, base_path, cur_path)
+    publish_comparison_report(case_name, check_metric, base_metrics, cur_metrics, base_path, cur_path, phase=phase)
 
     for metric, threshold in check_metric.items():
         max_error = 0.0
         max_error_idx = 0
         check_flag = True
         if metric == "runtime_info/tgs":
             if cur_steps > 10:
-                relative_errors = abs(np.array(base_metrics[metric][10:-1]) - np.array(cur_metrics[metric][10:-1])) / (
-                    np.array(base_metrics[metric][10:-1])
+                base_vals = np.array(base_metrics[metric][10:-1], dtype=float)
+                cur_vals = np.array(cur_metrics[metric][10:-1], dtype=float)
+                degradation = np.zeros_like(base_vals, dtype=float)
+                valid_base = np.abs(base_vals) >= 1e-10
+                degradation[valid_base] = np.maximum(
+                    (base_vals[valid_base] - cur_vals[valid_base]) / np.abs(base_vals[valid_base]),
+                    0.0,
                 )
-                max_error = np.percentile(relative_errors, 80)
+                max_error = float(np.percentile(degradation, 80))
                 if max_error > threshold:
                     fail_metric[metric] = (
-                        f"{metric} relative error bigger than {threshold} after 10 step, baseline: {base_metrics[metric][10:-1]}, now: {cur_metrics[metric][10:-1]}, relative error: {relative_errors}"
+                        f"{metric} degradation bigger than {threshold} after step 10, "
+                        f"baseline: {base_metrics[metric][10:-1]}, now: {cur_metrics[metric][10:-1]}, "
+                        f"degradation: {degradation.tolist()}"
                     )
                     check_flag = False
                 else:
                     check_flag = True
             else:
                 logger.warning("It's meaningless to compare tgs because of the small steps.")
                 check_flag = False
+        elif metric == "memory/max_memory_GB":
+            for idx, (old, cur) in enumerate(zip(base_metrics[metric], cur_metrics[metric])):
+                if abs(old) < 1e-10:
+                    relative_error = float("inf") if abs(cur) > 1e-10 else 0.0
+                else:
+                    relative_error = round(abs(old - cur) / abs(old), 2)
+                if relative_error > max_error:
+                    max_error = relative_error
+                    max_error_idx = idx
+                if relative_error > threshold:
+                    fail_metric[metric] = (
+                        f"{metric} relative error bigger than {threshold} in {idx} steps, "
+                        f"baseline: {old:.6f}, now: {cur:.6f}, relative error: {relative_error}"
+                    )
+                    check_flag = False
+                    break
+
+            if check_flag:
+                has_gradient, gradient_info = detect_memory_upward_gradient(cur_metrics[metric])
+                if has_gradient:
+                    fail_metric[metric] = f"{metric} shows sustained upward gradient in current run, {gradient_info}"
+                    check_flag = False
         else:
             for idx, (old, cur) in enumerate(zip(base_metrics[metric], cur_metrics[metric])):
                 if abs(old) < 1e-10:
@@ -82,7 +168,7 @@ def check_result(case_name, base_path, cur_path, check_metric):
     return result, f"Some metric check failed: {fail_metric}"
 
 
-def check_rl_result(case_name, base_path, cur_path, assert_info):
+def check_rl_result(case_name, base_path, cur_path, assert_info, phase=None):
     fail_metric = {}
     check_metrics_list = assert_info["check_metrics"]
 
@@ -96,7 +182,9 @@ def check_rl_result(case_name, base_path, cur_path, assert_info):
     )
 
     check_metric_dict = {item["metric"]: item["threshold"] for item in check_metrics_list}
-    publish_comparison_report(case_name, check_metric_dict, base_metrics, cur_metrics, base_path, cur_path)
+    publish_comparison_report(
+        case_name, check_metric_dict, base_metrics, cur_metrics, base_path, cur_path, phase=phase
+    )
 
     for config in check_metrics_list:
         metric = config["metric"]
diff --git a/autotest/utils/metric_report.py b/autotest/utils/metric_report.py