
Commit bbb5e94

Update llama.cpp related code to use openai format

1 parent 909d9f1

File tree

6 files changed: +64 -33 lines

src/codegate/config.py

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@ class Config:
     log_format: LogFormat = LogFormat.JSON
     prompts: PromptConfig = field(default_factory=PromptConfig)

-    chat_model_path: str = "./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf"
+    model_base_path: str = "./models"
     chat_model_n_ctx: int = 32768
     chat_model_n_gpu_layers: int = -1

@@ -102,7 +102,7 @@ def from_file(cls, config_path: Union[str, Path]) -> "Config":
             host=config_data.get("host", cls.host),
             log_level=config_data.get("log_level", cls.log_level.value),
             log_format=config_data.get("log_format", cls.log_format.value),
-            chat_model_path=config_data.get("chat_model_path", cls.chat_model_path),
+            model_base_path=config_data.get("chat_model_path", cls.model_base_path),
             chat_model_n_ctx=config_data.get(
                 "chat_model_n_ctx", cls.chat_model_n_ctx
            ),
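
The configuration now points at a directory of models rather than a single GGUF file. A minimal sketch of how the new field could be used to resolve a model path (hypothetical usage, not part of the commit; the model name is an example, and note that Config.from_file still reads the legacy "chat_model_path" key from the config file):

from codegate.config import Config

# Hypothetical: construct a Config with the new base-path field.
config = Config(model_base_path="./models")

# A request's "model" name is joined with the base path to locate the GGUF file,
# mirroring the lookup added in the llama.cpp completion handler below.
model_name = "qwen2.5-coder-1.5b-instruct-q5_k_m"  # example model name
model_path = f"{config.model_base_path}/{model_name}.gguf"
print(model_path)  # ./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf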

src/codegate/inference/inference_engine.py

Lines changed: 10 additions & 1 deletion
@@ -46,14 +46,23 @@ async def __get_model(self, model_path, embedding=False, n_ctx=512, n_gpu_layers

         return self.__models[model_path]

+    async def complete(self, model_path, n_ctx=512, n_gpu_layers=0, **completion_request):
+        """
+        Generates a completion using the specified model and request parameters.
+        """
+        model = await self.__get_model(
+            model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers
+        )
+        return model.create_completion(**completion_request)
+
     async def chat(self, model_path, n_ctx=512, n_gpu_layers=0, **chat_completion_request):
         """
         Generates a chat completion using the specified model and request parameters.
         """
         model = await self.__get_model(
             model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers
         )
-        return model.create_completion(**chat_completion_request)
+        return model.create_chat_completion(**chat_completion_request)

     async def embed(self, model_path, content):
         """

src/codegate/providers/litellmshim/generators.py

Lines changed: 0 additions & 1 deletion
@@ -47,7 +47,6 @@ async def llamacpp_stream_generator(stream: Iterator[Any]) -> AsyncIterator[str]
         if hasattr(chunk, "model_dump_json"):
             chunk = chunk.model_dump_json(exclude_none=True, exclude_unset=True)
         try:
-            chunk["content"] = chunk["choices"][0]["text"]
             yield f"data:{json.dumps(chunk)}\n\n"
             await asyncio.sleep(0)
         except Exception as e:
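
With llama.cpp now emitting OpenAI-format chunks, the generator no longer needs to copy choices[0]["text"] into a top-level "content" field; it serializes each chunk as-is. A rough sketch of the SSE line it produces for a chat chunk (field values are illustrative):

import json

# Illustrative OpenAI-format streaming chunk as produced by create_chat_completion().
chunk = {
    "id": "chatcmpl-example",
    "object": "chat.completion.chunk",
    "choices": [{"index": 0, "delta": {"content": "Hello"}, "finish_reason": None}],
}

# The generator forwards the chunk untouched as a server-sent event.
sse_line = f"data:{json.dumps(chunk)}\n\n"
print(sse_line)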

src/codegate/providers/llamacpp/completion_handler.py

Lines changed: 13 additions & 9 deletions
@@ -20,10 +20,6 @@ def translate_request(self, data: Dict, api_key: str) -> ChatCompletionRequest:
         if completion_request is None:
             raise Exception("Couldn't translate the request")

-        # Replace n_predict option with max_tokens
-        if 'n_predict' in completion_request:
-            completion_request['max_tokens'] = completion_request['n_predict']
-            del completion_request['n_predict']
         return ChatCompletionRequest(**completion_request)

     def translate_streaming_response(
@@ -50,12 +46,20 @@ async def execute_completion(
         stream: bool = False
     ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
         """
-        Execute the completion request with LiteLLM's API
+        Execute the completion request with the inference engine API
         """
-        response = await self.inference_engine.chat(Config.get_config().chat_model_path,
-                                                    Config.get_config().chat_model_n_ctx,
-                                                    Config.get_config().chat_model_n_gpu_layers,
-                                                    **request)
+        model_path = f"{Config.get_config().model_base_path}/{request['model']}.gguf"
+
+        if 'prompt' in request:
+            response = await self.inference_engine.complete(model_path,
+                                                            Config.get_config().chat_model_n_ctx,
+                                                            Config.get_config().chat_model_n_gpu_layers,
+                                                            **request)
+        else:
+            response = await self.inference_engine.chat(model_path,
+                                                        Config.get_config().chat_model_n_ctx,
+                                                        Config.get_config().chat_model_n_gpu_layers,
+                                                        **request)
         return response

     def create_streaming_response(
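
execute_completion() now builds the model path from model_base_path plus the request's "model" field and dispatches on the presence of "prompt": completion/FIM bodies go to inference_engine.complete(), chat bodies to inference_engine.chat(). A rough sketch of the two request shapes and the path resolution (all values are examples):

# Routed to inference_engine.complete() because it carries a "prompt".
fim_request = {
    "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
    "prompt": "<|fim_prefix|>def add(a, b):\n    <|fim_suffix|>\n<|fim_middle|>",
    "max_tokens": 64,
}

# Routed to inference_engine.chat() because it has no "prompt" key.
chat_request = {
    "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
    "messages": [{"role": "user", "content": "hello"}],
    "max_tokens": 64,
}

# Path resolution, assuming the default model_base_path.
model_base_path = "./models"  # Config.get_config().model_base_path
model_path = f"{model_base_path}/{fim_request['model']}.gguf"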

src/codegate/providers/llamacpp/provider.py

Lines changed: 4 additions & 4 deletions
@@ -19,11 +19,11 @@ def provider_route_name(self) -> str:

     def _setup_routes(self):
         """
-        Sets up the /chat route for the provider as expected by the
-        Llama API. Extracts the API key from the "Authorization" header and
-        passes it to the completion handler.
+        Sets up the /completions and /chat/completions routes for the
+        provider as expected by the Llama API.
         """
-        @self.router.post(f"/{self.provider_route_name}/completion")
+        @self.router.post(f"/{self.provider_route_name}/completions")
+        @self.router.post(f"/{self.provider_route_name}/chat/completions")
         async def create_completion(
             request: Request,
         ):
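
The provider now exposes OpenAI-compatible routes instead of the old /completion endpoint. A hedged sketch of calling them with the requests library; the host, port, and "llamacpp" route prefix are assumptions, not confirmed by this diff:

import requests

base = "http://localhost:8989/llamacpp"  # assumed CodeGate address and provider_route_name

# Legacy-style completion body (has "prompt").
requests.post(f"{base}/completions", json={
    "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
    "prompt": "def add(a, b):",
})

# OpenAI chat body (has "messages").
requests.post(f"{base}/chat/completions", json={
    "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
    "messages": [{"role": "user", "content": "hello"}],
})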

tests/test_inference.py

Lines changed: 35 additions & 16 deletions
@@ -1,38 +1,57 @@
 import pytest

-# @pytest.mark.asyncio
-# async def test_generate(inference_engine) -> None:
-#     """Test code generation."""

-#     prompt = '''
-#     import requests
+@pytest.mark.asyncio
+async def test_generate(inference_engine) -> None:
+    """Test code generation."""

-#     # Function to call API over http
-#     def call_api(url):
-#     '''
-#     model_path = "./models/qwen2.5-coder-1.5B.q5_k_m.gguf"
+    completion_request = {
+        "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
+        "max_tokens": 4096,
+        "temperature": 0,
+        "stream": True,
+        "stop": [
+            "<|endoftext|>",
+            "<|fim_prefix|>",
+            "<|fim_middle|>",
+            "<|fim_suffix|>",
+            "<|fim_pad|>",
+            "<|repo_name|>",
+            "<|file_sep|>",
+            "<|im_start|>",
+            "<|im_end|>",
+            "/src/",
+            "#- coding: utf-8",
+            "```",
+        ],
+        "prompt": "<|fim_prefix|>\n# codegate/test.py\nimport requests\n\ndef call_api(url):\n"
+        " <|fim_suffix|>\n\n\n\nresponse = call_api('http://localhost/test')"
+        "\nprint(response)<|fim_middle|>",
+    }
+    model_path = f"./models/{completion_request['model']}.gguf"
+    response = await inference_engine.complete(model_path, **completion_request)

-#     async for chunk in inference_engine.generate(model_path, prompt):
-#         print(chunk)
+    for chunk in response:
+        assert chunk["choices"][0]["text"] is not None


 @pytest.mark.asyncio
 async def test_chat(inference_engine) -> None:
     """Test chat completion."""

     chat_request = {
-        "prompt": "<|im_start|>user\nhello<|im_end|>\n<|im_start|>assistant\n",
-        "stream": True,
+        "messages": [{"role": "user", "content": "hello"}],
+        "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
         "max_tokens": 4096,
-        "top_k": 50,
         "temperature": 0,
+        "stream": True,
     }

-    model_path = "./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf"
+    model_path = f"./models/{chat_request['model']}.gguf"
     response = await inference_engine.chat(model_path, **chat_request)

     for chunk in response:
-        assert chunk["choices"][0]["text"] is not None
+        assert 'delta' in chunk["choices"][0]


 @pytest.mark.asyncio
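
The updated assertions reflect the two streaming chunk shapes: completion chunks carry choices[0]["text"], while chat chunks carry an OpenAI-style choices[0]["delta"]. A minimal sketch of the shapes the tests rely on (values are illustrative):

# Illustrative streamed chunks in OpenAI format.
completion_chunk = {"choices": [{"index": 0, "text": "def", "finish_reason": None}]}
chat_chunk = {"choices": [{"index": 0, "delta": {"content": "Hi"}, "finish_reason": None}]}

assert completion_chunk["choices"][0]["text"] is not None  # test_generate
assert "delta" in chat_chunk["choices"][0]                 # test_chat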
