Skip to content

Commit a30dfc7

Browse files
authored
Merge pull request #1 from kvcache-ai/main
Enhance Store and TransferEngine with health check, metrics, and NUMA support
2 parents 99e9ece + 8e40563 commit a30dfc7

76 files changed

Lines changed: 6903 additions & 2047 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/ci.yml

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -347,17 +347,6 @@ jobs:
347347
kill $SERVER_PID 2>/dev/null || true
348348
wait $SERVER_PID 2>/dev/null || true
349349
350-
- name: Test Mooncake EP Backend (CPU Only)
351-
env:
352-
MC_FORCE_TCP: "true"
353-
run: |
354-
source test_env/bin/activate
355-
python -m unittest mooncake-wheel.tests.test_mooncake_backend_cpu
356-
# Disable these tests in CI as they fail occasionally.
357-
# python -m unittest mooncake-wheel.tests.test_mooncake_backend_elastic
358-
# python -m unittest mooncake-wheel.tests.test_mooncake_backend_p2p_cpu
359-
shell: bash
360-
361350
- name: Test Safetensor Functions
362351
run: |
363352
source test_env/bin/activate
@@ -377,7 +366,6 @@ jobs:
377366
env:
378367
CI: "true"
379368
BUILD_WITH_EP: "1"
380-
EP_TORCH_VERSIONS: "2.9.0;2.9.1;2.10.0"
381369
TORCH_CUDA_ARCH_LIST: "8.0;9.0"
382370
SCCACHE_GHA_ENABLED: "true"
383371

@@ -444,7 +432,7 @@ jobs:
444432
run: |
445433
mkdir build
446434
cd build
447-
cmake .. -DUSE_ETCD=ON -DUSE_CXL=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_EP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=ON -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
435+
cmake .. -DUSE_ETCD=ON -DUSE_CXL=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=ON -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
448436
shell: bash
449437
# TODO: this build does not yet enable USE_NVMEOF or USE_MNNVL
450438

@@ -478,7 +466,7 @@ jobs:
478466
run: |
479467
cd build
480468
rm -r */tests
481-
cmake .. -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF -DUSE_HTTP=ON -DENABLE_SCCACHE=ON -DUSE_CXL=ON
469+
cmake .. -DBUILD_UNIT_TESTS=OFF -DBUILD_EXAMPLES=OFF -DUSE_HTTP=ON -DENABLE_SCCACHE=ON -DUSE_CXL=ON -DWITH_EP=ON -DEP_TORCH_VERSIONS="2.9.0;2.9.1;2.10.0"
482470
shell: bash
483471

484472
- name: Build project

.github/workflows/ci_ascend.yml

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
name: 'CI Test on ASCEND Platform'
2+
3+
on:
4+
push:
5+
branches: [ "main" ]
6+
pull_request:
7+
branches: [ "main" ]
8+
types: [opened, synchronize, reopened, labeled]
9+
10+
jobs:
11+
build-and-test:
12+
if: github.repository == 'kvcache-ai/Mooncake'
13+
runs-on: self-hosted
14+
15+
container:
16+
image: localhost:5000/mooncake-hixl-ci:v5
17+
options: --privileged --user 0:0 --device /dev/davinci0 --device /dev/davinci1 --device /dev/davinci2 --device /dev/davinci3
18+
--device /dev/davinci4 --device /dev/davinci5 --device /dev/davinci6 --device /dev/davinci7
19+
--device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc --ulimit nproc=65535:65535
20+
env:
21+
GITHUB_ACTIONS: "true"
22+
LD_PRELOAD: "/usr/lib64/libjemalloc.so.2:"
23+
volumes:
24+
- /usr/local/dcmi:/usr/local/dcmi
25+
- /usr/local/Ascend/driver/:/usr/local/Ascend/driver/
26+
- /etc/ascend_install.info:/etc/ascend_install.info
27+
- /etc/hccn.conf:/etc/hccn.conf
28+
29+
steps:
30+
- name: Checkout code
31+
uses: actions/checkout@v4
32+
with:
33+
fetch-depth: 1
34+
persist-credentials: false
35+
36+
- name: Configure CMake
37+
shell: bash
38+
run: |
39+
source /usr/local/Ascend/cann-9.0.0/set_env.sh
40+
pwd
41+
if ! git submodule update --init --recursive; then
42+
if [ ! -d "extern/pybind11" ] || [ -z "$(ls -A 'extern/pybind11' 2>/dev/null)" ]; then
43+
echo "git submodule update failed, try to cp pybind11..."
44+
if [ -d "../pybind11" ]; then
45+
cp -r ../pybind11 extern/
46+
else
47+
echo "Error: ../pybind11 does not exist. Cannot copy pybind11."
48+
exit 1
49+
fi
50+
else
51+
echo "Detected that extern/pybind11 already exists, continuing execution...."
52+
fi
53+
fi
54+
echo "Configuring CMake..."
55+
rm -rf build
56+
mkdir -p build
57+
cd build
58+
59+
cmake .. \
60+
-DUSE_ASCEND_DIRECT=ON \
61+
-DBUILD_EXAMPLES=OFF \
62+
-DBUILD_UNIT_TESTS=OFF
63+
64+
- name: Build
65+
shell: bash
66+
run: |
67+
source /usr/local/Ascend/cann-9.0.0/set_env.sh
68+
echo "Building..."
69+
cd build
70+
make -j8
71+
make install -j8
72+
echo "Mooncake installed successfully."
73+
74+
- name: Run Hixl Mooncake Store Test
75+
shell: bash
76+
run: |
77+
source /usr/local/Ascend/cann-9.0.0/set_env.sh
78+
set -e
79+
export ASCEND_PROCESS_LOG_PATH=/tmp/hixl-test-log/
80+
export ASCEND_GLOBAL_LOG_LEVEL=3
81+
echo "=== Cloning Hixl repository ==="
82+
cd ..
83+
rm -rf hixl
84+
git clone https://gitcode.com/cann/hixl.git
85+
cd hixl/examples/third_parties/mooncake_store/python/
86+
87+
export LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}
88+
echo "=== Starting Mooncake Master ==="
89+
90+
# Find mooncake_master binary
91+
MOONCAKE_MASTER=$(find /usr/local/bin /usr/bin -name "mooncake_master" -type f 2>/dev/null | head -1)
92+
if [ -z "$MOONCAKE_MASTER" ]; then
93+
# Try finding in build directory
94+
MOONCAKE_MASTER=$(find $GITHUB_WORKSPACE/build -name "mooncake_master" -type f 2>/dev/null | head -1)
95+
fi
96+
97+
if [ -z "$MOONCAKE_MASTER" ]; then
98+
echo "Error: mooncake_master binary not found"
99+
exit 1
100+
fi
101+
102+
echo "Found mooncake_master at: $MOONCAKE_MASTER"
103+
104+
# Start Mooncake master in background
105+
$MOONCAKE_MASTER \
106+
--enable_http_metadata_server=true \
107+
--http_metadata_server_host=0.0.0.0 \
108+
--http_metadata_server_port=8080 \
109+
> /tmp/mooncake_master.log 2>&1 &
110+
MASTER_PID=$!
111+
echo "Mooncake Master started with PID: $MASTER_PID"
112+
113+
# Wait for master to be ready
114+
echo "Waiting for Mooncake Master to initialize..."
115+
sleep 5
116+
117+
# Check if master is running
118+
if ! kill -0 $MASTER_PID 2>/dev/null; then
119+
echo "Error: Mooncake Master failed to start"
120+
cat /tmp/mooncake_master.log
121+
exit 1
122+
fi
123+
124+
echo "Mooncake Master is running"
125+
echo "=== Running Hixl Mooncake Store Tests ==="
126+
# List of test cases to run
127+
TEST_CASES=(
128+
"batch_put_get_sample.py"
129+
"batch_put_get_multi_buffers_sample.py"
130+
)
131+
132+
# Track test results
133+
FAILED_TESTS=()
134+
PASSED_TESTS=()
135+
136+
# Run each test case
137+
export ASCEND_BUFFER_POOL=4:8
138+
for test_case in "${TEST_CASES[@]}"; do
139+
echo ""
140+
echo "========================================="
141+
echo "Running test: $test_case"
142+
echo "========================================="
143+
144+
if [ ! -f "$test_case" ]; then
145+
echo "Warning: Test file $test_case not found, skipping..."
146+
continue
147+
fi
148+
# Run the test script directly with python3 and tee its output to a per-test log
149+
python3 $test_case \
150+
--device_id=0 \
151+
--rank=0 \
152+
2>&1 | tee "/tmp/hixl_test_${test_case%.py}.log"
153+
154+
TEST_RESULT=${PIPESTATUS[0]}
155+
156+
if [ $TEST_RESULT -eq 0 ]; then
157+
echo "✓ $test_case PASSED"
158+
PASSED_TESTS+=("$test_case")
159+
else
160+
echo "✗ $test_case FAILED with exit code: $TEST_RESULT"
161+
FAILED_TESTS+=("$test_case")
162+
fi
163+
done
164+
165+
echo ""
166+
echo "========================================="
167+
echo "Test Summary"
168+
echo "========================================="
169+
echo "Passed tests: ${#PASSED_TESTS[@]}"
170+
for test in "${PASSED_TESTS[@]}"; do
171+
echo " ✓ $test"
172+
done
173+
174+
echo ""
175+
echo "Failed tests: ${#FAILED_TESTS[@]}"
176+
for test in "${FAILED_TESTS[@]}"; do
177+
echo " ✗ $test"
178+
done
179+
180+
# Cleanup: Stop Mooncake Master
181+
echo ""
182+
echo "Stopping Mooncake Master..."
183+
kill $MASTER_PID 2>/dev/null || true
184+
wait $MASTER_PID 2>/dev/null || true
185+
186+
# Exit with error if any tests failed
187+
if [ ${#FAILED_TESTS[@]} -gt 0 ]; then
188+
echo ""
189+
echo "Some tests failed!"
190+
exit 1
191+
fi
192+
193+
echo ""
194+
echo "All Hixl Mooncake Store tests completed successfully!"
195+
196+
197+
- name: Test Summary
198+
if: always()
199+
shell: bash
200+
run: |
201+
echo "CI Test completed"
202+
203+
- name: Upload Test Logs
204+
if: always()
205+
uses: actions/upload-artifact@v4
206+
with:
207+
name: test-logs-${{ github.run_number }}
208+
path: |
209+
/tmp/hixl-test-log/*
210+
retention-days: 30
211+
if-no-files-found: warn

.github/workflows/ci_cu13.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ jobs:
2020
env:
2121
BUILD_WITH_EP: "1"
2222
CU13_BUILD: "1"
23-
EP_TORCH_VERSIONS: "2.9.0;2.9.1;2.10.0"
2423
TORCH_CUDA_ARCH_LIST: "8.0;9.0"
2524
SCCACHE_GHA_ENABLED: "true"
2625

@@ -81,6 +80,7 @@ jobs:
8180
-DWITH_STORE=ON \
8281
-DWITH_P2P_STORE=ON \
8382
-DWITH_EP=ON \
83+
-DEP_TORCH_VERSIONS="2.9.0;2.9.1;2.10.0" \
8484
-DWITH_METRICS=ON \
8585
-DBUILD_UNIT_TESTS=OFF \
8686
-DBUILD_EXAMPLES=ON \

.github/workflows/integration-test.yml

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,21 @@ name: 'Integration test (Linux)'
33
on:
44
push:
55
branches: [ "main" ]
6+
paths:
7+
- 'mooncake-*/**'
8+
- 'extern/**'
9+
- 'CMakeLists.txt'
10+
- 'scripts/**'
11+
- '.github/workflows/**'
612
pull_request_target:
713
branches: [ "main" ]
814
types: [opened, synchronize, reopened, labeled]
9-
15+
paths:
16+
- 'mooncake-*/**'
17+
- 'extern/**'
18+
- 'CMakeLists.txt'
19+
- 'scripts/**'
20+
- '.github/workflows/**'
1021

1122
jobs:
1223
test-sglang-integration:
@@ -18,14 +29,18 @@ jobs:
1829
if: ${{ env.tone_user_name != '' }}
1930
run: |
2031
SHA="${{ github.event.pull_request.head.sha }}"
32+
PR_ID="${{ github.event.pull_request.number }}"
33+
2134
if [ "${{ github.event_name }}" = "push" ]; then
2235
SHA="${{ github.sha }}"
36+
PR_ID=""
2337
fi
38+
echo "PR_ID=${PR_ID}"
2439
max_attempts=120
2540
attempt=1
2641
while [ $attempt -le $max_attempts ]; do
2742
echo "Attempt $attempt: Fetching artifact..."
28-
if curl -L -fs -o artifact.json -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/${{ github.repository }}/actions/artifacts; then
43+
if curl -L -fs -o artifact.json -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/${{ github.repository }}/actions/artifacts?per_page=100; then
2944
artifact_id=""
3045
if jq empty artifact.json >/dev/null 2>&1; then
3146
artifact_id=$(jq -r ".artifacts[] | select(.name | contains(\"py312\") ) | select(.name | contains(\"cu130\") | not) | select(.workflow_run.head_sha == \"$SHA\" ) | .id" artifact.json | head -n 1)
@@ -53,9 +68,14 @@ jobs:
5368
echo "Failed to fetch artifacts after $max_attempts attempts"
5469
exit 1
5570
fi
71+
72+
ENV_INFO="ARTIFACT_ID=${artifact_id} GIT_REPO=${{ github.repository }}"
73+
if [ -n "$PR_ID" ]; then
74+
ENV_INFO="${ENV_INFO} PR_ID=${PR_ID}"
75+
fi
5676
signature="${{ secrets.TONE_USER_NAME }}|${{ secrets.TONE_USER_TOKEN }}|$(python3 -c "import time;print(time.time())")"
5777
signature="$(python3 -c "import base64;print(base64.b64encode(\"$signature\".encode('utf-8')).decode('utf-8'))")"
58-
curl -s -H 'Content-Type: application/json' -X POST -d "{\"workspace\":\"mooncake_test\",\"project\":\"mooncake-ci\",\"template\":\"mooncake-ci-test\",\"name\":\"mooncake-ci-${SHA}\",\"username\":\"${{ secrets.TONE_USER_NAME }}\",\"env_ifs\":\" \",\"env_info\":\"ARTIFACT_ID=${artifact_id} GIT_REPO=${{ github.repository }}\",\"signature\":\"$signature\"}" https://tone.openanolis.cn/api/job/create/ > job.json
78+
curl -s -H 'Content-Type: application/json' -X POST -d "{\"workspace\":\"mooncake_test\",\"project\":\"mooncake-ci\",\"template\":\"mooncake-ci-test\",\"name\":\"mooncake-ci-${SHA}\",\"username\":\"${{ secrets.TONE_USER_NAME }}\",\"env_ifs\":\" \",\"env_info\":\"${ENV_INFO}\",\"signature\":\"$signature\"}" https://tone.openanolis.cn/api/job/create/ > job.json
5979
if [ "$(jq .code job.json)" == 200 ]; then
6080
echo "job created"
6181
else

.github/workflows/release-cuda13.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ jobs:
1818
env:
1919
BUILD_WITH_EP: "1"
2020
CU13_BUILD: "1"
21-
EP_TORCH_VERSIONS: "2.9.0;2.9.1;2.10.0"
2221
TORCH_CUDA_ARCH_LIST: "8.0;9.0"
2322
steps:
2423
- name: Checkout source
@@ -66,7 +65,7 @@ jobs:
6665
sudo bash -x dependencies.sh -y
6766
mkdir build
6867
cd build
69-
cmake .. -DBUILD_UNIT_TESTS=OFF -DUSE_HTTP=ON -DUSE_ETCD=ON -DUSE_CUDA=ON -DWITH_EP=ON -DSTORE_USE_ETCD=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Release
68+
cmake .. -DBUILD_UNIT_TESTS=OFF -DUSE_HTTP=ON -DUSE_ETCD=ON -DUSE_CUDA=ON -DWITH_EP=ON -DEP_TORCH_VERSIONS="2.9.0;2.9.1;2.10.0" -DSTORE_USE_ETCD=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Release
7069
shell: bash
7170

7271
- name: Build project

.github/workflows/release.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ jobs:
1717
python-version: ['3.10', '3.11', '3.12', '3.13']
1818
env:
1919
BUILD_WITH_EP: "1"
20-
EP_TORCH_VERSIONS: "2.9.0;2.9.1;2.10.0"
2120
TORCH_CUDA_ARCH_LIST: "8.0;9.0"
2221
steps:
2322
- name: Checkout source
@@ -65,7 +64,7 @@ jobs:
6564
sudo bash -x dependencies.sh -y
6665
mkdir build
6766
cd build
68-
cmake .. -DBUILD_UNIT_TESTS=OFF -DUSE_HTTP=ON -DUSE_ETCD=ON -DUSE_CUDA=ON -DWITH_EP=ON -DSTORE_USE_ETCD=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Release
67+
cmake .. -DBUILD_UNIT_TESTS=OFF -DUSE_HTTP=ON -DUSE_ETCD=ON -DUSE_CUDA=ON -DWITH_EP=ON -DEP_TORCH_VERSIONS="2.9.0;2.9.1;2.10.0" -DSTORE_USE_ETCD=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Release
6968
shell: bash
7069

7170
- name: Build project

0 commit comments

Comments
 (0)