Commit 5912af5

Fix conflicts and lints
1 parent e3bec63 commit 5912af5

5 files changed: 30 additions, 51 deletions

.github/workflows/pr-test.yml

Lines changed: 0 additions & 20 deletions
@@ -105,24 +105,6 @@ jobs:
           cd test/srt
           python3 run_suite.py --suite per-commit-8-gpu
 
-  performance-test-1-gpu-part-1:
-    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
-      github.event.pull_request.draft == false
-    runs-on: 8-gpu-runner
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Install dependencies
-        run: |
-          bash scripts/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 40
-        run: |
-          cd test/srt
-          python3 run_suite.py --suite per-commit-8-gpu
-
   performance-test-1-gpu-part-1:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
@@ -132,8 +114,6 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Install dependencies
-        env:
-          FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.5/flashinfer-python' || 'https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python' }}
         run: |
           bash scripts/ci_install_dependency.sh
 
python/sglang/srt/layers/attention/flashinfer_backend.py

Lines changed: 18 additions & 17 deletions
@@ -15,6 +15,7 @@
 
 import torch
 import torch._dynamo
+
 torch._dynamo.config.suppress_errors = True
 
 
@@ -56,6 +57,7 @@ class PrefillMetadata:
     use_ragged: bool
     extend_no_prefix: bool
 
+
 # Reuse this workspace buffer across all flashinfer wrappers
 global_workspace_buffer = None
 
@@ -282,7 +284,7 @@ def init_cuda_graph_state(
         )
         self.cuda_graph_qk_indptr = [x.clone() for x in self.kv_indptr]
         self.cuda_graph_qo_indptr = [x.clone() for x in self.kv_indptr]
-
+
         # Force allocation
         self.cuda_graph_custom_mask[0] = 0
         for i in range(len(self.cuda_graph_qk_indptr)):
@@ -291,7 +293,7 @@ def init_cuda_graph_state(
         for i in range(len(self.cuda_graph_qo_indptr)):
             if len(self.cuda_graph_qo_indptr[i]) > 0:
                 self.cuda_graph_qo_indptr[i][0] = 0
-
+
         # Force synchronization to ensure all tensors are allocated
         torch.cuda.synchronize()
 
@@ -508,11 +510,11 @@ def safe_forward_call(q, kv_cache):
                 k_scale=layer.k_scale,
                 v_scale=layer.v_scale,
             )
-
+
         # Call the wrapped function
         o = safe_forward_call(
             q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
-            forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+            forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id),
         )
 
         return o.view(-1, layer.tp_q_head_num * layer.head_dim)
@@ -1185,7 +1187,7 @@ def fast_decode_plan(
     batch_size = len(last_page_len)
     if logits_soft_cap is None:
         logits_soft_cap = 0.0
-
+
     # Handle data types consistently
     if data_type is not None:
         if q_data_type is None:
@@ -1194,7 +1196,7 @@ def fast_decode_plan(
             kv_data_type = data_type
     elif q_data_type is None:
         q_data_type = "float16"
-
+
     if kv_data_type is None:
         kv_data_type = q_data_type
 
@@ -1218,19 +1220,19 @@ def fast_decode_plan(
     self._paged_kv_indices_buf = indices
     self._paged_kv_last_page_len_buf = last_page_len
     if self.use_tensor_cores:
-        self._qo_indptr_buf = qo_indptr_host.to(self.device, non_blocking=non_blocking)
+        self._qo_indptr_buf = qo_indptr_host.to(
+            self.device, non_blocking=non_blocking
+        )
 
     # Create empty tensors for dtype info if needed
     empty_q_data = torch.empty(
         0,
         dtype=(
-            getattr(torch, q_data_type)
-            if isinstance(q_data_type, str)
-            else q_data_type
+            getattr(torch, q_data_type) if isinstance(q_data_type, str) else q_data_type
         ),
         device=self.device,
     )
-
+
     empty_kv_cache = torch.empty(
         0,
         dtype=(
@@ -1248,7 +1250,7 @@ def fast_decode_plan(
     )
 
     with torch.cuda.device(self.device):
-
+
         if self.use_tensor_cores:
             # Convert indptr to CPU, as the authors intended
             if global_override_indptr_cpu is not None:
@@ -1259,10 +1261,8 @@ def fast_decode_plan(
             # ALSO convert last_page_len to CPU
             last_page_len_host = last_page_len.cpu()
 
-            kv_lens_arr_host = get_seq_lens(
-                indptr_host, last_page_len_host, page_size
-            )
-
+            kv_lens_arr_host = get_seq_lens(indptr_host, last_page_len_host, page_size)
+
             try:
                 # Make sure we pass exactly 15 arguments for tensor core version
                 self._plan_info = self._cached_module.plan(
@@ -1285,6 +1285,7 @@ def fast_decode_plan(
             except Exception as e:
                 # Log the error for debugging
                 import logging
+
                 logging.error(f"Error in tensor core plan: {e}")
                 raise
         else:
@@ -1310,6 +1311,7 @@ def fast_decode_plan(
             except Exception as e:
                 # Log the error for debugging
                 import logging
+
                 logging.error(f"Error in standard plan: {e}")
                 raise
 
@@ -1319,4 +1321,3 @@ def fast_decode_plan(
     self._sm_scale = sm_scale
     self._rope_scale = rope_scale
     self._rope_theta = rope_theta
-
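
Aside on the dtype handling touched in the fast_decode_plan hunks above: q_data_type and kv_data_type may arrive either as a string name or as a torch.dtype, and the code normalizes them with getattr(torch, ...) before allocating the empty placeholder tensors. A minimal standalone sketch of that pattern, using an illustrative helper name (resolve_dtype is not part of the diff):

import torch

def resolve_dtype(dtype, default="float16"):
    # Accept either a torch.dtype (torch.float16) or a string name ("float16"),
    # falling back to a default when nothing was provided.
    if dtype is None:
        dtype = default
    return getattr(torch, dtype) if isinstance(dtype, str) else dtype

# Both calls resolve to torch.float16.
assert resolve_dtype("float16") is torch.float16
assert resolve_dtype(torch.float16) is torch.float16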

python/sglang/srt/layers/attention/flashinfer_mla_backend.py

Lines changed: 9 additions & 7 deletions
@@ -14,8 +14,9 @@
 from typing import TYPE_CHECKING, Callable, Optional, Union
 
 import torch
-import triton
 import torch._dynamo
+import triton
+
 torch._dynamo.config.suppress_errors = True
 
 from sglang.global_config import global_config
@@ -209,15 +210,15 @@ def init_cuda_graph_state(
         self.cuda_graph_kv_lens = torch.ones(
             (max_bs,), dtype=torch.int32, device=self.device
         )
-
+
         # Force allocation by performing a small operation and synchronizing
         # This ensures all tensors are properly allocated in GPU memory
         self.cuda_graph_kv_indices[0] = 0
         self.cuda_graph_qo_indptr[0] = 0
         self.cuda_graph_kv_indptr[0] = 0
         self.cuda_graph_kv_lens[0] = 1
         torch.cuda.synchronize()
-
+
         # For fast decode plan in graph replaying
         self.cuda_graph_qo_indptr_cpu = self.cuda_graph_qo_indptr.to("cpu")
         self.cuda_graph_kv_indptr_cpu = self.cuda_graph_kv_indptr.to("cpu")
@@ -399,11 +400,11 @@ def forward_decode(
                     k,
                     v,
                 )
-
+
         # Reshape inputs
         reshaped_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
         k_buffer = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
-
+
         # Direct call to run without the wrapper
         o = decode_wrapper.run(
             reshaped_q[:, :, : layer.v_head_dim],
@@ -855,8 +856,9 @@ def fast_mla_decode_plan(
         except Exception as e:
             # Log error for debugging
             import logging
+
             logging.error(f"Error in MLA plan: {e}")
-
+
             # Try alternate version with more arguments if needed
             try:
                 self._cached_module.plan(
@@ -865,7 +867,7 @@ def fast_mla_decode_plan(
                     self._pin_memory_int_workspace_buffer,
                     qo_indptr_cpu,
                     kv_indptr_cpu,
-                    kv_indices, # Include kv_indices which was missing
+                    kv_indices,  # Include kv_indices which was missing
                     kv_len_arr_cpu,
                     num_heads,
                     head_dim_ckv,
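
Both attention backends set torch._dynamo.config.suppress_errors = True right after the imports (visible as context in the hunks above). This flag makes torch.compile log a graph-compilation failure and fall back to eager execution instead of raising. A minimal, self-contained illustration (scaled_sum is an invented example function, not from the diff):

import torch
import torch._dynamo

# When graph compilation fails, suppress_errors makes torch.compile log the
# error and silently run the original eager function instead of raising.
torch._dynamo.config.suppress_errors = True

@torch.compile
def scaled_sum(x):
    return (x * 2.0).sum()

print(scaled_sum(torch.ones(4)))  # prints tensor(8.), even on eager fallback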

scripts/ci_install_dependency.sh

Lines changed: 2 additions & 6 deletions
@@ -2,9 +2,6 @@
 # Install the dependency in CI.
 set -euxo pipefail
 
-# Use repo from environment variables, passed from GitHub Actions
-FLASHINFER_REPO="${FLASHINFER_REPO:-https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python}"
-
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 bash "${SCRIPT_DIR}/killall_sglang.sh"
 
@@ -18,9 +15,8 @@ rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
 # Update pip
 pip install --upgrade pip
 
-# Install flashinfer and sgl-kernel
-pip install flashinfer_python==0.2.5 --find-links ${FLASHINFER_REPO} --no-cache-dir
-pip install sgl-kernel==0.0.9.post1 --no-cache-dir
+# Install sgl-kernel
+pip install sgl-kernel==0.1.0 --no-cache-dir
 
 # Install the main package
 pip install -e "python[all]"
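
The script now pins sgl-kernel==0.1.0 and no longer installs flashinfer_python or reads FLASHINFER_REPO. If it helps to verify what actually landed in the CI environment, here is a small hedged check (package names taken from the script; name-normalization behavior can vary slightly across Python versions):

from importlib.metadata import PackageNotFoundError, version

# Print the installed version of each package the CI script used to manage.
for pkg in ("sgl-kernel", "flashinfer_python"):
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")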

sgl-kernel/build.sh

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ if [ ${CUDA_VERSION} = "12.8" ]; then
     TORCH_INSTALL="pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION//.}"
 else
     DOCKER_IMAGE="pytorch/manylinux-builder:cuda${CUDA_VERSION}"
-    TORCH_INSTALL="pip install --no-cache-dir torch==2.6.0 --index-url https://download.pytorch.org/whl/cu${CUDA_VERSION//.}"
+    TORCH_INSTALL="pip install --no-cache-dir torch==2.5.1 --index-url https://download.pytorch.org/whl/cu${CUDA_VERSION//.}"
 fi
 
 docker run --rm \
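
With this pin, the non-12.8 build path compiles against torch 2.5.1. A quick way to confirm the torch build and CUDA toolkit inside the build container, purely as an illustrative check:

import torch

# Report the torch build and the CUDA toolkit it was compiled against.
print(torch.__version__)   # expected to start with "2.5.1"
print(torch.version.cuda)  # e.g. "12.4", depending on ${CUDA_VERSION}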
