From ada72d4ee239168d9fde2369d7d33664b6c40120 Mon Sep 17 00:00:00 2001 From: alcanerian Date: Tue, 22 Apr 2025 12:49:02 +0000 Subject: [PATCH 1/2] [fix] mem_fraction_static for deepseekv3 dp8 on h200 --- python/sglang/srt/server_args.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index b35dd93213..2df8a3a084 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -223,20 +223,23 @@ def __post_init__(self): # Set mem fraction static, which depends on the tensor parallelism size if self.mem_fraction_static is None: - if gpu_mem <= 81920: - if self.tp_size >= 16: - self.mem_fraction_static = 0.79 - elif self.tp_size >= 8: - self.mem_fraction_static = 0.81 - elif self.tp_size >= 4: - self.mem_fraction_static = 0.85 - elif self.tp_size >= 2: - self.mem_fraction_static = 0.87 - else: - self.mem_fraction_static = 0.88 + if self.tp_size >= 16: + self.mem_fraction_static = 0.79 + elif self.tp_size >= 8: + self.mem_fraction_static = 0.81 + elif self.tp_size >= 4: + self.mem_fraction_static = 0.85 + elif self.tp_size >= 2: + self.mem_fraction_static = 0.87 else: + self.mem_fraction_static = 0.88 + if gpu_mem > 81920: # FIXME: more fine grained auto-selection polices - self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem + mem_fraction = self.mem_fraction_static + self.mem_fraction_static = min( + mem_fraction + 40960 * (1 - mem_fraction) / gpu_mem, + (gpu_mem - 1024 * 13) / gpu_mem, + ) # Set chunked prefill size, which depends on the gpu memory capacity if self.chunked_prefill_size is None: From 5a80fa9e6f0b1cc0afcba0028d17ed09eab7abbd Mon Sep 17 00:00:00 2001 From: alcanerian Date: Tue, 22 Apr 2025 16:45:14 +0000 Subject: [PATCH 2/2] [fix] reduce dp capture bs --- .../srt/model_executor/cuda_graph_runner.py | 3 ++- python/sglang/srt/server_args.py | 27 +++++++++---------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 92cf0388e2..8d3f63e156 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -134,7 +134,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner): ) gpu_mem = get_device_memory_capacity() - if gpu_mem is not None and gpu_mem > 81920: + # Batch size of each rank will not become so large when DP is on + if gpu_mem is not None and gpu_mem > 81920 and server_args.dp_size == 1: capture_bs += list(range(160, 257, 8)) if max(capture_bs) > model_runner.req_to_token_pool.size: diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 2df8a3a084..b35dd93213 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -223,23 +223,20 @@ def __post_init__(self): # Set mem fraction static, which depends on the tensor parallelism size if self.mem_fraction_static is None: - if self.tp_size >= 16: - self.mem_fraction_static = 0.79 - elif self.tp_size >= 8: - self.mem_fraction_static = 0.81 - elif self.tp_size >= 4: - self.mem_fraction_static = 0.85 - elif self.tp_size >= 2: - self.mem_fraction_static = 0.87 + if gpu_mem <= 81920: + if self.tp_size >= 16: + self.mem_fraction_static = 0.79 + elif self.tp_size >= 8: + self.mem_fraction_static = 0.81 + elif self.tp_size >= 4: + self.mem_fraction_static = 0.85 + elif self.tp_size >= 2: + self.mem_fraction_static = 0.87 + else: + self.mem_fraction_static = 0.88 else: - self.mem_fraction_static = 0.88 - if gpu_mem > 81920: # FIXME: more fine grained auto-selection polices - mem_fraction = self.mem_fraction_static - self.mem_fraction_static = min( - mem_fraction + 40960 * (1 - mem_fraction) / gpu_mem, - (gpu_mem - 1024 * 13) / gpu_mem, - ) + self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem # Set chunked prefill size, which depends on the gpu memory capacity if self.chunked_prefill_size is None: