
Commit a6275cd

Merge remote-tracking branch 'upstream/main' into upstream-sync-2025-05-21-notag

2 parents: e540432 + dd5fa7e

78 files changed (+3544, -704 lines)

.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ steps:
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
     plugins:
       - docker-login#v3.0.0:
-          username: vllm
+          username: vllmbot
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 3 additions & 1 deletion
@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
+HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
-aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
 
 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.
@@ -47,6 +48,7 @@ trap remove_docker_container EXIT
 docker run --rm -it --device=/dev/neuron0 --network bridge \
     -v "${HF_CACHE}:${HF_MOUNT}" \
     -e "HF_HOME=${HF_MOUNT}" \
+    -e "HF_TOKEN=${HF_TOKEN}" \
     -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
     -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
     --name "${container_name}" \
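The new HF_TOKEN line resolves the Hugging Face token from AWS Secrets Manager at run time rather than baking it into the environment. For readers who prefer Python over the CLI-plus-jq pipeline, here is a minimal boto3 sketch of the same lookup; the script is illustrative only and not part of the commit:

    import json

    import boto3

    # Same lookup as the shell pipeline added above:
    # aws secretsmanager get-secret-value ... | jq -r .VLLM_NEURON_CI_HF_TOKEN
    client = boto3.client("secretsmanager", region_name="us-west-2")
    secret = client.get_secret_value(SecretId="ci/vllm-neuron/hf-token")
    hf_token = json.loads(secret["SecretString"])["VLLM_NEURON_CI_HF_TOKEN"]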

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 0 deletions
@@ -138,6 +138,7 @@ steps:
   - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
+  - tests/distributed/test_events
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
@@ -156,6 +157,7 @@
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests

.github/CODEOWNERS

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,7 @@
 /vllm/model_executor/guided_decoding @mgoin @russellb
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
+/vllm/lora @jeejeelee
 CMakeLists.txt @tlrmchlsmth
 
 # vLLM V1
@@ -40,3 +41,4 @@ CMakeLists.txt @tlrmchlsmth
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
 /tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
+/tests/lora @jeejeelee

benchmarks/kernels/benchmark_paged_attention.py

Lines changed: 5 additions & 1 deletion
@@ -84,7 +84,10 @@ def main(
     if version == "v2":
         if current_platform.is_rocm():
             global PARTITION_SIZE
-            PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM
+            if not args.custom_paged_attn and not current_platform.is_navi():
+                PARTITION_SIZE = 1024
+            else:
+                PARTITION_SIZE = PARTITION_SIZE_ROCM
         num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
         tmp_output = torch.empty(
             size=(num_seqs, num_query_heads, num_partitions, head_size),
@@ -159,6 +162,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
             scale,
             block_tables,
             seq_lens,
+            None,
             block_size,
             max_seq_len,
             alibi_slopes,
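The rewritten branch only uses the 1024-token partition size when neither the custom paged-attention kernel nor a Navi GPU is in play; otherwise it falls back to PARTITION_SIZE_ROCM. Downstream, num_partitions is a ceiling division of max_seq_len by PARTITION_SIZE. A small worked sketch of that arithmetic (values hypothetical):

    # Integer ceiling division, as used to size the benchmark's tmp_output:
    # ceil(max_seq_len / PARTITION_SIZE) without floating point.
    PARTITION_SIZE = 1024       # non-Navi ROCm, custom kernel disabled
    max_seq_len = 4097          # hypothetical longest sequence
    num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
    assert num_partitions == 5  # 4097 tokens spill into a fifth partition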

csrc/cutlass_extensions/common.hpp

Lines changed: 0 additions & 9 deletions
@@ -15,15 +15,6 @@
       cutlassGetStatusString(error)); \
   }
 
-/**
- * Panic wrapper for unwinding CUDA runtime errors
- */
-#define CUDA_CHECK(status)                                          \
-  {                                                                 \
-    cudaError_t error = status;                                     \
-    TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error));   \
-  }
-
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,

csrc/moe/torch_bindings.cpp

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 
   // Calculate the result of moe by summing up the partial results
   // from all selected experts.
-  m.def("moe_sum(Tensor! input, Tensor output) -> ()");
+  m.def("moe_sum(Tensor input, Tensor! output) -> ()");
   m.impl("moe_sum", torch::kCUDA, &moe_sum);
 
   // Aligning the number of tokens to be processed by each expert such
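This fix moves the mutation marker in the operator schema: in PyTorch's schema language, Tensor! flags the argument an op writes in place, and moe_sum reads input while writing its reduction into output, so the marker belongs on output. A minimal Python sketch of the same annotation via torch.library, using a hypothetical mylib namespace rather than vLLM's actual C++ registration:

    import torch

    # `Tensor! output` tells the dispatcher that `output` is mutated in place,
    # which matters for autograd, functionalization, and torch.compile.
    torch.library.define("mylib::moe_sum", "(Tensor input, Tensor! output) -> ()")

    @torch.library.impl("mylib::moe_sum", "cpu")
    def moe_sum_cpu(input: torch.Tensor, output: torch.Tensor) -> None:
        # Sum the partial results from the selected experts (dim 1) into output.
        torch.sum(input, dim=1, out=output)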
