From ada72d4ee239168d9fde2369d7d33664b6c40120 Mon Sep 17 00:00:00 2001
From: alcanerian <alcanerian@gmail.com>
Date: Tue, 22 Apr 2025 12:49:02 +0000
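Subject: [PATCH 1/2] [fix] mem_fraction_static for deepseekv3 dp8 on h200

Always derive the default mem_fraction_static from tp_size first. When
gpu_mem > 81920, raise that value by 40960 * (1 - fraction) / gpu_mem,
capped at the previous default of (gpu_mem - 1024 * 13) / gpu_mem.
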
---
 python/sglang/srt/server_args.py | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index b35dd93213..2df8a3a084 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -223,20 +223,23 @@ def __post_init__(self):
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if gpu_mem <= 81920:
-                if self.tp_size >= 16:
-                    self.mem_fraction_static = 0.79
-                elif self.tp_size >= 8:
-                    self.mem_fraction_static = 0.81
-                elif self.tp_size >= 4:
-                    self.mem_fraction_static = 0.85
-                elif self.tp_size >= 2:
-                    self.mem_fraction_static = 0.87
-                else:
-                    self.mem_fraction_static = 0.88
+            if self.tp_size >= 16:
+                self.mem_fraction_static = 0.79
+            elif self.tp_size >= 8:
+                self.mem_fraction_static = 0.81
+            elif self.tp_size >= 4:
+                self.mem_fraction_static = 0.85
+            elif self.tp_size >= 2:
+                self.mem_fraction_static = 0.87
             else:
+                self.mem_fraction_static = 0.88
+            if gpu_mem > 81920:
                 # FIXME: more fine grained auto-selection polices
-                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
+                mem_fraction = self.mem_fraction_static
+                self.mem_fraction_static = min(
+                    mem_fraction + 40960 * (1 - mem_fraction) / gpu_mem,
+                    (gpu_mem - 1024 * 13) / gpu_mem,
+                )
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:

From 5a80fa9e6f0b1cc0afcba0028d17ed09eab7abbd Mon Sep 17 00:00:00 2001
From: alcanerian <alcanerian@gmail.com>
Date: Tue, 22 Apr 2025 16:45:14 +0000
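Subject: [PATCH 2/2] [fix] reduce dp capture bs

Only append the extra CUDA graph capture batch sizes (160 to 256, step 8)
for gpu_mem > 81920 when dp_size == 1. This also reverts the
mem_fraction_static change to server_args.py made in PATCH 1/2.
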
---
 .../srt/model_executor/cuda_graph_runner.py   |  3 ++-
 python/sglang/srt/server_args.py              | 27 +++++++++----------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
index 92cf0388e2..8d3f63e156 100644
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -134,7 +134,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
             )
 
         gpu_mem = get_device_memory_capacity()
-        if gpu_mem is not None and gpu_mem > 81920:
+        # With DP enabled, per-rank batch sizes stay small; skip the larger sizes.
+        if gpu_mem is not None and gpu_mem > 81920 and server_args.dp_size == 1:
             capture_bs += list(range(160, 257, 8))
 
     if max(capture_bs) > model_runner.req_to_token_pool.size:
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 2df8a3a084..b35dd93213 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -223,23 +223,20 @@ def __post_init__(self):
 
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if self.tp_size >= 16:
-                self.mem_fraction_static = 0.79
-            elif self.tp_size >= 8:
-                self.mem_fraction_static = 0.81
-            elif self.tp_size >= 4:
-                self.mem_fraction_static = 0.85
-            elif self.tp_size >= 2:
-                self.mem_fraction_static = 0.87
+            if gpu_mem <= 81920:
+                if self.tp_size >= 16:
+                    self.mem_fraction_static = 0.79
+                elif self.tp_size >= 8:
+                    self.mem_fraction_static = 0.81
+                elif self.tp_size >= 4:
+                    self.mem_fraction_static = 0.85
+                elif self.tp_size >= 2:
+                    self.mem_fraction_static = 0.87
+                else:
+                    self.mem_fraction_static = 0.88
             else:
-                self.mem_fraction_static = 0.88
-            if gpu_mem > 81920:
                 # FIXME: more fine grained auto-selection polices
-                mem_fraction = self.mem_fraction_static
-                self.mem_fraction_static = min(
-                    mem_fraction + 40960 * (1 - mem_fraction) / gpu_mem,
-                    (gpu_mem - 1024 * 13) / gpu_mem,
-                )
+                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None: