sgl-project
diff --git a/‎docs/advanced_features/server_arguments.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/advanced_features/server_arguments.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/advanced_features/sgl_model_gateway.md‎
Lines changed: 2 additions & 2 deletions b/‎docs/advanced_features/sgl_model_gateway.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/basic_usage/native_api.ipynb‎
Lines changed: 2 additions & 2 deletions b/‎docs/basic_usage/native_api.ipynb‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/developer_guide/bench_serving.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/developer_guide/bench_serving.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/sglang/bench_serving.py‎
Lines changed: 2 additions & 2 deletions b/‎python/sglang/bench_serving.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎python/sglang/lang/backend/runtime_endpoint.py‎
Lines changed: 2 additions & 2 deletions b/‎python/sglang/lang/backend/runtime_endpoint.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎python/sglang/profiler.py‎
Lines changed: 1 addition & 1 deletion b/‎python/sglang/profiler.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/sglang/test/bench_one_batch_server_internal.py‎
Lines changed: 2 additions & 2 deletions b/‎python/sglang/test/bench_one_batch_server_internal.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎python/sglang/test/kits/cache_hit_kit.py‎
Lines changed: 1 addition & 1 deletion b/‎python/sglang/test/kits/cache_hit_kit.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/sglang/test/kl_test_utils.py‎
Lines changed: 2 additions & 2 deletions b/‎python/sglang/test/kl_test_utils.py‎
Lines changed: 2 additions & 2 deletions
@@ -212,7 +212,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | Argument | Description | Defaults | Options |
 | --- | --- | --- | --- |
 | `--api-key` | Set API key of the server. It is also used in the OpenAI API compatible server. | `None` | Type: str |
-| `--admin-api-key` | Set **admin API key** for administrative/control endpoints (e.g., weights update, cache flush, `/get_server_info`). Endpoints marked as admin-only require `Authorization: Bearer <admin_api_key>` when this is set. | `None` | Type: str |
+| `--admin-api-key` | Set **admin API key** for administrative/control endpoints (e.g., weights update, cache flush, `/server_info`). Endpoints marked as admin-only require `Authorization: Bearer <admin_api_key>` when this is set. | `None` | Type: str |
 | `--served-model-name` | Override the model name returned by the v1/models endpoint in OpenAI API server. | `None` | Type: str |
 | `--weight-version` | Version identifier for the model weights. Defaults to 'default' if not specified. | `default` | Type: str |
 | `--chat-template` | The builtin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server. | `None` | Type: str |
 
@@ -77,7 +77,7 @@ SGLang Model Gateway is a high-performance model-routing gateway for large-scale
 
 ### Control Plane
 
-- **Worker Manager** discovers capabilities (`/get_server_info`, `/get_model_info`), tracks load, and registers/removes workers in the shared registry.
+- **Worker Manager** discovers capabilities (`/server_info`, `/get_model_info`), tracks load, and registers/removes workers in the shared registry.
 - **Job Queue** serializes add/remove requests and exposes status (`/workers/{worker_id}`) so clients can track onboarding progress.
 - **Load Monitor** feeds cache-aware and power-of-two policies with live worker load statistics.
 - **Health Checker** continuously probes workers and updates readiness, circuit breaker state, and router metrics.
@@ -552,7 +552,7 @@ Response:
 | `GET` | `/engine_metrics` | Engine-level metrics from workers |
 | `GET` | `/v1/models` | List available models |
 | `GET` | `/get_model_info` | Get model information |
-| `GET` | `/get_server_info` | Get server information |
+| `GET` | `/server_info` | Get server information |
 | `POST` | `/flush_cache` | Clear all caches |
 | `GET` | `/get_loads` | Get all worker loads |
 | `POST` | `/wasm` | Upload WASM module |
 
@@ -10,7 +10,7 @@
     "\n",
     "- `/generate` (text generation model)\n",
     "- `/get_model_info`\n",
-    "- `/get_server_info`\n",
+    "- `/server_info`\n",
     "- `/health`\n",
     "- `/health_generate`\n",
     "- `/flush_cache`\n",
@@ -140,7 +140,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "url = f\"http://localhost:{port}/get_server_info\"\n",
+    "url = f\"http://localhost:{port}/server_info\"\n",
     "\n",
     "response = requests.get(url)\n",
     "print_highlight(response.text)"
 
@@ -352,4 +352,4 @@ python3 -m sglang.bench_serving \
 ### Notes
 
 - The script raises the file descriptor soft limit (`RLIMIT_NOFILE`) to help with many concurrent connections.
-- For sglang, `/get_server_info` is queried post-run to report speculative decoding accept length when available.
+- For sglang, `/server_info` is queried post-run to report speculative decoding accept length when available.
@@ -1402,7 +1402,7 @@ async def limited_request_func(request_func_input, pbar):
 
     if "sglang" in backend:
         server_info = requests.get(
-            base_url + "/get_server_info", headers=get_auth_headers()
+            base_url + "/server_info", headers=get_auth_headers()
         )
         if server_info.status_code == 200:
             server_info_json = server_info.json()
@@ -1538,7 +1538,7 @@ async def limited_request_func(request_func_input, pbar):
         print("{:<40} {:<10.2f}".format("Max ITL (ms):", metrics.max_itl_ms))
     print("=" * 50)
 
-    resp = requests.get(base_url + "/get_server_info", headers=get_auth_headers())
+    resp = requests.get(base_url + "/server_info", headers=get_auth_headers())
     server_info = resp.json() if resp.status_code == 200 else None
 
     if (
 
@@ -67,7 +67,7 @@ def flush_cache(self):
 
     def get_server_info(self):
         res = http_request(
-            self.base_url + "/get_server_info",
+            self.base_url + "/server_info",
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -531,7 +531,7 @@ def encode(
 
     async def get_server_info(self):
         async with aiohttp.ClientSession() as session:
-            async with session.get(f"{self.url}/get_server_info") as response:
+            async with session.get(f"{self.url}/server_info") as response:
                 if response.status == 200:
                     return await response.json()
                 else:
 
@@ -42,7 +42,7 @@ def run_profile(
     # Dump server args.
     file_path = Path(output_dir) / "server_args.json"
     if not file_path.exists():
-        response = requests.get(url + "/get_server_info")
+        response = requests.get(url + "/server_info")
         response.raise_for_status()
         server_args_data = response.json()
         with open(file_path, "w") as file:
 
@@ -609,7 +609,7 @@ def run_one_case(
         last_gen_throughput = -1
         acc_length = -1
     else:
-        response = requests.get(url + "/get_server_info", timeout=DEFAULT_TIMEOUT)
+        response = requests.get(url + "/server_info", timeout=DEFAULT_TIMEOUT)
         response.raise_for_status()
         server_info = response.json()
         internal_state = server_info.get("internal_states", [{}])
@@ -793,7 +793,7 @@ def run_benchmark_internal(
         skip_max_running_requests_threshold = float("inf")
     else:
         model_name = None
-        response = requests.get(base_url + "/get_server_info", timeout=DEFAULT_TIMEOUT)
+        response = requests.get(base_url + "/server_info", timeout=DEFAULT_TIMEOUT)
         response.raise_for_status()
         server_info = response.json()
         if "tokenizer_path" in server_info:
 
@@ -221,7 +221,7 @@ async def _send_one(payload):
 def _get_page_size(base_url: str) -> int:
     """Query server for page_size used by radix cache."""
     try:
-        resp = requests.get(f"{base_url}/get_server_info", timeout=10)
+        resp = requests.get(f"{base_url}/server_info", timeout=10)
         resp.raise_for_status()
         info = resp.json()
         return info.get("page_size", 1)
 
@@ -208,7 +208,7 @@ def test_input_output_logprobs_match_helper(
 def test_input_output_logprobs_match_prefill_cache_hit_helper(
     base_url, ACC_THRESHOLDS, model_name, max_samples=None, max_new_tokens=8192
 ):
-    server_info = requests.get(base_url + "/get_server_info").json()
+    server_info = requests.get(base_url + "/server_info").json()
     if server_info["disable_radix_cache"]:
         print("Radix cache is disabled, skipping test")
         return
@@ -261,7 +261,7 @@ def test_input_output_logprobs_match_prefill_cache_hit_helper(
 def test_input_output_logprobs_match_decode_cache_hit_helper(
     base_url, ACC_THRESHOLDS, model_name, max_samples=None, max_new_tokens=8192
 ):
-    server_info = requests.get(base_url + "/get_server_info").json()
+    server_info = requests.get(base_url + "/server_info").json()
     if server_info["disable_radix_cache"]:
         print("Radix cache is disabled, skipping test")
         return