
Commit a6275cd

Merge remote-tracking branch 'upstream/main' into upstream-sync-2025-05-21-notag

2 parents: e540432 + dd5fa7e

78 files changed (+3544, -704 lines)

.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ steps:
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
     plugins:
       - docker-login#v3.0.0:
-          username: vllm
+          username: vllmbot
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 3 additions & 1 deletion
@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
+HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
-aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
 
 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.
@@ -47,6 +48,7 @@ trap remove_docker_container EXIT
 docker run --rm -it --device=/dev/neuron0 --network bridge \
     -v "${HF_CACHE}:${HF_MOUNT}" \
     -e "HF_HOME=${HF_MOUNT}" \
+    -e "HF_TOKEN=${HF_TOKEN}" \
     -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
     -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
     --name "${container_name}" \
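The new HF_TOKEN line resolves the Hugging Face token from AWS Secrets Manager at run time rather than baking it into the environment. For readers who prefer Python over the CLI-plus-jq pipeline, here is a minimal boto3 sketch of the same lookup; the script is illustrative only and not part of the commit:

    import json

    import boto3

    # Same lookup as the shell pipeline added above:
    # aws secretsmanager get-secret-value ... | jq -r .VLLM_NEURON_CI_HF_TOKEN
    client = boto3.client("secretsmanager", region_name="us-west-2")
    secret = client.get_secret_value(SecretId="ci/vllm-neuron/hf-token")
    hf_token = json.loads(secret["SecretString"])["VLLM_NEURON_CI_HF_TOKEN"]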

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 0 deletions
@@ -138,6 +138,7 @@ steps:
   - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
+  - tests/distributed/test_events
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
@@ -156,6 +157,7 @@
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests

.github/CODEOWNERS

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,7 @@
 /vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
+/vllm/lora @jeejeelee
 CMakeLists.txt @tlrmchlsmth
 
 # vLLM V1
@@ -40,3 +41,4 @@ CMakeLists.txt @tlrmchlsmth
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
 /tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
+/tests/lora @jeejeelee

benchmarks/kernels/benchmark_paged_attention.py

Lines changed: 5 additions & 1 deletion
@@ -84,7 +84,10 @@ def main(
     if version == "v2":
         if current_platform.is_rocm():
             global PARTITION_SIZE
-            PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM
+            if not args.custom_paged_attn and not current_platform.is_navi():
+                PARTITION_SIZE = 1024
+            else:
+                PARTITION_SIZE = PARTITION_SIZE_ROCM
         num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
         tmp_output = torch.empty(
             size=(num_seqs, num_query_heads, num_partitions, head_size),
@@ -159,6 +162,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
             scale,
             block_tables,
             seq_lens,
+            None,
             block_size,
             max_seq_len,
             alibi_slopes,
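The rewritten branch only uses the 1024-token partition size when neither the custom paged-attention kernel nor a Navi GPU is in play; otherwise it falls back to PARTITION_SIZE_ROCM. Downstream, num_partitions is a ceiling division of max_seq_len by PARTITION_SIZE. A small worked sketch of that arithmetic (values hypothetical):

    # Integer ceiling division, as used to size the benchmark's tmp_output:
    # ceil(max_seq_len / PARTITION_SIZE) without floating point.
    PARTITION_SIZE = 1024       # non-Navi ROCm, custom kernel disabled
    max_seq_len = 4097          # hypothetical longest sequence
    num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
    assert num_partitions == 5  # 4097 tokens spill into a fifth partition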

csrc/cutlass_extensions/common.hpp

Lines changed: 0 additions & 9 deletions
@@ -15,15 +15,6 @@
       cutlassGetStatusString(error)); \
   }
 
-/**
- * Panic wrapper for unwinding CUDA runtime errors
- */
-#define CUDA_CHECK(status)                                          \
-  {                                                                 \
-    cudaError_t error = status;                                     \
-    TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error));   \
-  }
-
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,

csrc/moe/torch_bindings.cpp

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 
   // Calculate the result of moe by summing up the partial results
   // from all selected experts.
-  m.def("moe_sum(Tensor! input, Tensor output) -> ()");
+  m.def("moe_sum(Tensor input, Tensor! output) -> ()");
   m.impl("moe_sum", torch::kCUDA, &moe_sum);
 
   // Aligning the number of tokens to be processed by each expert such
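This fix moves the mutation marker in the operator schema: in PyTorch's schema language, Tensor! flags the argument an op writes in place, and moe_sum reads input while writing its reduction into output, so the marker belongs on output. A minimal Python sketch of the same annotation via torch.library, using a hypothetical mylib namespace rather than vLLM's actual C++ registration:

    import torch

    # `Tensor! output` tells the dispatcher that `output` is mutated in place,
    # which matters for autograd, functionalization, and torch.compile.
    torch.library.define("mylib::moe_sum", "(Tensor input, Tensor! output) -> ()")

    @torch.library.impl("mylib::moe_sum", "cpu")
    def moe_sum_cpu(input: torch.Tensor, output: torch.Tensor) -> None:
        # Sum the partial results from the selected experts (dim 1) into output.
        torch.sum(input, dim=1, out=output)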
