Commit 5b614fc

hebiao064 authored and tarinkk committed

Optimize a pad operation to save 25us (sgl-project#5945)
1 parent: 674adb7 · commit: 5b614fc

File tree

1 file changed (+3 −2 lines)

python/sglang/srt/layers/attention/flashattention_backend.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -1617,8 +1617,9 @@ def init_forward_metadata_replay_cuda_graph(
         metadata.max_seq_len_k = max_len

         metadata.cache_seqlens_int32 = seq_lens.to(torch.int32)
-        metadata.cu_seqlens_k = torch.nn.functional.pad(
-            torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0)
+        # Optimize cumulative sequence length calculation
+        metadata.cu_seqlens_k[1:].copy_(
+            torch.cumsum(seq_lens, dim=0, dtype=torch.int32)
         )

         max_seq_pages = (
```
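To see what the change buys, here is a minimal standalone sketch of the before/after, assuming `cu_seqlens_k` is a preallocated, zero-initialized int32 buffer of length `batch_size + 1` (the sizes and local names below are illustrative, not taken from the file):

```python
import torch

# Illustrative sizes; the real buffers are set up during CUDA-graph
# capture in flashattention_backend.py.
batch_size = 8
device = "cuda" if torch.cuda.is_available() else "cpu"
seq_lens = torch.randint(1, 512, (batch_size,), device=device)

# Before: pad() allocates a new (batch_size + 1)-element tensor on every
# call just to prepend a leading zero to the cumulative sum.
cu_seqlens_old = torch.nn.functional.pad(
    torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0)
)

# After: write the cumulative sum into a preallocated buffer. Index 0
# stays 0, so no pad is needed, and the extra allocation plus pad kernel
# launch disappear (the ~25us cited in the commit title).
cu_seqlens_k = torch.zeros(batch_size + 1, dtype=torch.int32, device=device)
cu_seqlens_k[1:].copy_(torch.cumsum(seq_lens, dim=0, dtype=torch.int32))

assert torch.equal(cu_seqlens_k, cu_seqlens_old)
```

The in-place `copy_` also suits the CUDA-graph replay path this code lives in: a captured graph replays against fixed tensor addresses, so mutating the existing `cu_seqlens_k` buffer keeps its address stable, whereas rebuilding it with `pad` would allocate a fresh tensor on each call.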
