Commit 1f99e45

feat: add llama backend + thinking loop detection for cross-platform GPU inference
- backend_llama.py: new LlamaBackend using llama-cpp-python for GGUF models
  - Cross-platform: macOS (CPU/GPU), Windows (CUDA), Linux
  - Windows CUDA DLL auto-discovery (nvidia-cublas-cu12 pip package)
  - Qwen3 thinking mode control via /no_think prompt injection
  - Built-in loop detection integration (streaming + post-hoc)
- loop_detector.py: first-in-framework thinking loop detection
  - Jaccard-based semantic similarity at line boundaries
  - Catches patterns that token-level repeat_penalty misses
  - Three actions: escape, stop, warn
  - Independent module, usable by any backend
- hippo_api.py: new CLI args (--mode llama, --gguf-path, --n-gpu-layers,
  --thinking, --loop-detect, --loop-detect-window/threshold/action)
- Tests: 18 new tests (11 backend_llama + 7 loop_detector), all passing
- Validated on RTX 5060 Ti 16GB: Qwen3-30B-A3B at 85 tok/s, 0/20 loops

Ref: llama.cpp #21264 (community feature request, unimplemented)
1 parent ba7d720 commit 1f99e45
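
The loop_detector.py module that the message credits with "Jaccard-based semantic similarity at line boundaries" is not among the files rendered below. As a rough sketch of that idea: the helper names and the exact flagging rule here are assumptions for illustration; only the window/threshold/similarity knobs are taken from the LoopDetector(...) call in backend_llama.py.

# Sketch only: loop_detector.py itself is not shown on this page.
# The window/threshold/similarity knobs mirror the LoopDetector(...) call in
# backend_llama.py; helper names and the flagging rule are illustrative guesses.

def jaccard(a: str, b: str) -> float:
    """Jaccard similarity between the word sets of two lines."""
    wa, wb = set(a.lower().split()), set(b.lower().split())
    if not wa or not wb:
        return 0.0
    return len(wa & wb) / len(wa | wb)

def looks_like_loop(lines: list[str], window: int = 20,
                    threshold: int = 3, similarity: float = 0.7) -> bool:
    """Flag a loop when the newest complete line is near-duplicated by at
    least `threshold` earlier lines in the window. Set-based similarity
    catches paraphrased repeats that token-level repeat_penalty misses."""
    recent = lines[-window:]
    if len(recent) < 2:
        return False
    last = recent[-1]
    hits = sum(1 for prev in recent[:-1] if jaccard(prev, last) >= similarity)
    return hits >= threshold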

5 files changed

Lines changed: 668 additions & 1 deletion


pipeline/backend_llama.py

Lines changed: 222 additions & 0 deletions
@@ -0,0 +1,222 @@ (new file)

#!/usr/bin/env python3
"""
backend_llama.py — LlamaBackend for Hippo Pipeline.

Uses llama-cpp-python to run GGUF models on Windows/Linux/macOS with GPU support.
"""
from __future__ import annotations

import asyncio
import glob
import os
import time


def _init_cuda_dlls():
    """Auto-discover CUDA DLLs on Windows (nvidia-cublas-cu12 pip package)."""
    if os.name != 'nt':
        return
    import site
    try:
        sp = site.getsitepackages()
        # Check user site-packages too
        user_sp = [site.getusersitepackages()] if hasattr(site, 'getusersitepackages') else []
        all_sp = sp + user_sp
    except Exception:
        return

    for base in all_sp:
        nvidia_dir = os.path.join(base, 'nvidia')
        if not os.path.isdir(nvidia_dir):
            continue
        for sub in os.listdir(nvidia_dir):
            bin_dir = os.path.join(nvidia_dir, sub, 'bin')
            if os.path.isdir(bin_dir) and any(f.endswith('.dll') for f in os.listdir(bin_dir)):
                try:
                    os.add_dll_directory(bin_dir)
                except (OSError, FileNotFoundError):
                    pass


def _find_gguf(model_name: str) -> str | None:
    """Search common directories for a GGUF file matching model_name."""
    search_dirs = [
        os.path.expanduser("~/.cache/huggingface"),
        os.path.expanduser("~/.cache/modelscope"),
        os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "models"),
        os.path.join(os.path.dirname(os.path.abspath(__file__)), "models"),
        ".",
    ]
    for d in search_dirs:
        if not os.path.isdir(d):
            continue
        for f in glob.glob(os.path.join(d, "**", "*.gguf"), recursive=True):
            base = os.path.basename(f).lower()
            name_lower = model_name.lower().replace("/", "_")
            if name_lower in base or model_name.lower() in base:
                return f
    return None


class LlamaBackend:
    """Backend using llama-cpp-python for GGUF model inference.

    Duck-type compatible with HippoBackend (same interface).
    """

    def __init__(self, cfg: dict):
        self.cfg = cfg
        self.mode = cfg.get("mode", "llama")
        self.model_name = cfg.get("model", "")
        self._ready = False
        self._llama = None
        self._model_path = cfg.get("gguf_path") or cfg.get("model", "")
        self._n_gpu_layers = int(cfg.get("n_gpu_layers", -1))
        self._n_ctx = int(cfg.get("n_ctx", 4096))
        self._thinking = cfg.get("thinking", False)

        # Loop detection for thinking models
        self._loop_detect = cfg.get("loop_detect", False)
        self._loop_detector = None
        if self._loop_detect:
            from loop_detector import LoopDetector
            self._loop_detector = LoopDetector(
                window=cfg.get("loop_detect_window", 20),
                threshold=cfg.get("loop_detect_threshold", 3),
                similarity=cfg.get("loop_detect_similarity", 0.7),
                action=cfg.get("loop_detect_action", "escape"),
            )

    async def ready(self) -> bool:
        if self._llama is not None:
            return True
        _init_cuda_dlls()
        try:
            from llama_cpp import Llama
        except ImportError:
            print("❌ llama-cpp-python not installed. Run: pip install llama-cpp-python")
            return False

        # Resolve model path
        path = self._model_path
        if path and not os.path.isfile(path):
            found = _find_gguf(path)
            if found:
                path = found
            else:
                print(f"❌ GGUF model not found: {self._model_path}")
                return False

        if not path:
            print("❌ No model path specified. Use --gguf-path or --model.")
            return False

        if not os.path.isfile(path):
            print(f"❌ GGUF file not found: {path}")
            return False

        print(f"🔄 Loading GGUF model: {path} (n_gpu_layers={self._n_gpu_layers}, n_ctx={self._n_ctx})")
        loop = asyncio.get_event_loop()
        self._llama = await loop.run_in_executor(
            None,
            lambda: Llama(
                model_path=path,
                n_gpu_layers=self._n_gpu_layers,
                n_ctx=self._n_ctx,
                verbose=False,
            ),
        )
        print(f"✅ Model loaded: {os.path.basename(path)}")
        self._ready = True
        return True

    async def generate(self, messages: list[dict], max_tokens: int = 256,
                       temperature: float = 0.0, stream: bool = False):
        if not self._llama:
            raise RuntimeError("Model not loaded. Call ready() first.")

        loop = asyncio.get_event_loop()
        t0 = time.time()

        if stream:
            return self._generate_stream(messages, max_tokens, temperature, t0)

        # Disable Qwen3 thinking mode by appending /no_think to last user message
        if "qwen3" in self.model_name.lower() and not self._thinking:
            messages = list(messages)  # shallow copy
            if messages and messages[-1].get("role") == "user":
                messages[-1] = {**messages[-1], "content": messages[-1]["content"].rstrip() + " /no_think"}

        result = await loop.run_in_executor(
            None,
            lambda: self._llama.create_chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
            ),
        )
        elapsed = time.time() - t0
        choice = result.get("choices", [{}])[0]
        text = choice.get("message", {}).get("content", "")
        usage = result.get("usage", {})
        tokens = usage.get("completion_tokens", 0)

        # Post-hoc loop detection for non-streaming mode
        if self._loop_detector:
            loop_result = self._loop_detector.check_text(text)
            if loop_result:
                print(f"[LOOP] Loop detected in output: {loop_result['line'][:60]}... (action={loop_result['action']})")
            self._loop_detector.reset()

        return {
            "text": text,
            "tokens": tokens,
            "tok_s": tokens / max(elapsed, 0.001),
            "ar": 1.0,
            "time_s": elapsed,
        }

    async def _generate_stream(self, messages, max_tokens, temperature, t0):
        """Yield chunks as an async generator."""
        loop = asyncio.get_event_loop()

        # Disable Qwen3 thinking mode by appending /no_think to last user message
        if "qwen3" in self.model_name.lower() and not self._thinking:
            messages = list(messages)  # shallow copy
            if messages and messages[-1].get("role") == "user":
                messages[-1] = {**messages[-1], "content": messages[-1]["content"].rstrip() + " /no_think"}

        def _stream_sync():
            return self._llama.create_chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
                stream=True,
            )

        chunks = await loop.run_in_executor(None, _stream_sync)
        token_count = 0
        for chunk in chunks:
            delta = chunk.get("choices", [{}])[0].get("delta", {})
            content = delta.get("content", "")
            if content:
                token_count += 1

                # Streaming loop detection
                if self._loop_detector:
                    loop_result = self._loop_detector.feed(content)
                    if loop_result and loop_result.get("loop"):
                        action = loop_result.get("action", "warn")
                        print(f"[LOOP] Loop detected: {loop_result['line'][:60]}... (action={action})")
                        if action == "stop":
                            break
                        # action="escape" or "warn": continue generating

                yield {
                    "text": content,
                    "tokens": token_count,
                    "tok_s": 0,
                    "ar": 1.0,
                    "time_s": time.time() - t0,
                }
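
For orientation (not part of the commit), a minimal consumer of LlamaBackend could look like the sketch below. The cfg keys mirror what __init__ reads; the GGUF path is a placeholder.

import asyncio

from backend_llama import LlamaBackend

async def demo():
    cfg = {
        "mode": "llama",
        "model": "qwen3-4b",                         # name used for /no_think detection
        "gguf_path": "models/qwen3-4b-q4_k_m.gguf",  # placeholder path
        "n_gpu_layers": -1,                          # offload all layers
        "loop_detect": True,
        "loop_detect_action": "stop",
    }
    backend = LlamaBackend(cfg)
    if not await backend.ready():
        return

    messages = [{"role": "user", "content": "Summarize Jaccard similarity."}]

    # Non-streaming: a dict with text/tokens/tok_s/ar/time_s.
    result = await backend.generate(messages, max_tokens=128)
    print(result["text"], f"({result['tok_s']:.1f} tok/s)")

    # Streaming: generate(stream=True) hands back an async generator of chunks.
    stream = await backend.generate(messages, max_tokens=128, stream=True)
    async for chunk in stream:
        print(chunk["text"], end="", flush=True)

asyncio.run(demo())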

pipeline/hippo_api.py

Lines changed: 30 additions & 1 deletion
@@ -34,6 +34,12 @@
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 
+# Optional llama-cpp-python backend
+try:
+    from backend_llama import LlamaBackend
+except ImportError:
+    LlamaBackend = None
+
 # ─── Config ─────────────────────────────────────────────
 
 DEFAULT_API_CONFIG = {
@@ -204,6 +210,10 @@ def create_backend(cfg: dict) -> HippoBackend:
         return DFlashBackend(cfg)
     elif mode == "pipeline":
         return PipelineBackend(cfg)
+    elif mode == "llama":
+        if LlamaBackend is None:
+            raise ImportError("llama-cpp-python not installed. Run: pip install llama-cpp-python")
+        return LlamaBackend(cfg)
     else:
         return StandaloneBackend(cfg)
@@ -383,8 +393,16 @@ def main():
     parser.add_argument("--host", default=None)
     parser.add_argument("--port", type=int, default=None)
     parser.add_argument("--model", default=None)
-    parser.add_argument("--mode", default=None, choices=["standalone", "pipeline", "dflash"])
+    parser.add_argument("--mode", default=None, choices=["standalone", "pipeline", "dflash", "llama"])
     parser.add_argument("--api-key", default=None, help="Set API key (or comma-separated for multiple)")
+    parser.add_argument("--gguf-path", default=None, help="Path to GGUF model file (for llama mode)")
+    parser.add_argument("--n-gpu-layers", type=int, default=-1, help="GPU layers for llama backend (default: -1 = all)")
+    parser.add_argument("--thinking", action="store_true", default=False, help="Enable Qwen3 thinking mode")
+    parser.add_argument("--loop-detect", action="store_true", default=False, help="Enable thinking loop detection")
+    parser.add_argument("--loop-detect-window", type=int, default=20, help="Loop detection window size")
+    parser.add_argument("--loop-detect-threshold", type=int, default=3, help="Loop detection threshold")
+    parser.add_argument("--loop-detect-action", default="escape", choices=["escape", "stop", "warn"],
+                        help="Loop detection action")
     args = parser.parse_args()
 
     cfg = load_api_config(args.config)
@@ -400,18 +418,29 @@ def main():
         cfg["mode"] = args.mode
     if args.api_key:
         cfg["api_keys"] = [k.strip() for k in args.api_key.split(",")]
+    if args.gguf_path:
+        cfg["gguf_path"] = args.gguf_path
+    if args.n_gpu_layers is not None:
+        cfg["n_gpu_layers"] = args.n_gpu_layers
+    cfg["thinking"] = args.thinking
+    cfg["loop_detect"] = args.loop_detect
+    cfg["loop_detect_window"] = args.loop_detect_window
+    cfg["loop_detect_threshold"] = args.loop_detect_threshold
+    cfg["loop_detect_action"] = args.loop_detect_action
 
     host = cfg.get("host", "0.0.0.0")
     port = cfg.get("port", 8080)
     mode = cfg.get("mode", "standalone")
     model = cfg.get("model", "qwen3-4b")
     auth_status = "enabled" if cfg.get("api_keys") else "disabled"
+    loop_status = f"enabled (action={cfg.get('loop_detect_action', 'escape')})" if cfg.get("loop_detect") else "disabled"
 
     print("🦛 Hippo API Server")
     print(f" Host: {host}:{port}")
     print(f" Model: {model}")
     print(f" Mode: {mode}")
     print(f" Auth: {auth_status}")
+    print(f" Loop detect: {loop_status}")
     print(" Endpoints:")
     print(" POST /v1/chat/completions")
     print(" GET /v1/models")