This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Update llama.cpp-related code to use the OpenAI format #107

Merged 1 commit on Nov 27, 2024

4 changes: 2 additions & 2 deletions src/codegate/config.py
@@ -28,7 +28,7 @@ class Config:
     log_format: LogFormat = LogFormat.JSON
     prompts: PromptConfig = field(default_factory=PromptConfig)
 
-    chat_model_path: str = "./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf"
+    model_base_path: str = "./models"
     chat_model_n_ctx: int = 32768
     chat_model_n_gpu_layers: int = -1
 
@@ -102,7 +102,7 @@ def from_file(cls, config_path: Union[str, Path]) -> "Config":
             host=config_data.get("host", cls.host),
             log_level=config_data.get("log_level", cls.log_level.value),
             log_format=config_data.get("log_format", cls.log_format.value),
-            chat_model_path=config_data.get("chat_model_path", cls.chat_model_path),
+            model_base_path=config_data.get("chat_model_path", cls.model_base_path),
             chat_model_n_ctx=config_data.get(
                 "chat_model_n_ctx", cls.chat_model_n_ctx
             ),
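Note: with this change the single hard-coded GGUF path is replaced by a base directory, and the concrete file is derived from the model name in each request. A minimal sketch of that resolution, using the Config.get_config() accessor seen elsewhere in this PR; resolve_model_path is a hypothetical helper, not part of the diff:

from codegate.config import Config


def resolve_model_path(model_name: str) -> str:
    # Mirrors the f-string added in completion_handler.py:
    # f"{Config.get_config().model_base_path}/{request['model']}.gguf"
    return f"{Config.get_config().model_base_path}/{model_name}.gguf"


# e.g. resolve_model_path("qwen2.5-coder-1.5b-instruct-q5_k_m")
# -> "./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf" with the default model_base_path
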
11 changes: 10 additions & 1 deletion src/codegate/inference/inference_engine.py
@@ -46,14 +46,23 @@ async def __get_model(self, model_path, embedding=False, n_ctx=512, n_gpu_layers
 
         return self.__models[model_path]
 
+    async def complete(self, model_path, n_ctx=512, n_gpu_layers=0, **completion_request):
+        """
+        Generates a completion using the specified model and request parameters.
+        """
+        model = await self.__get_model(
+            model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers
+        )
+        return model.create_completion(**completion_request)
+
     async def chat(self, model_path, n_ctx=512, n_gpu_layers=0, **chat_completion_request):
         """
         Generates a chat completion using the specified model and request parameters.
         """
         model = await self.__get_model(
             model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers
         )
-        return model.create_completion(**chat_completion_request)
+        return model.create_chat_completion(**chat_completion_request)
 
     async def embed(self, model_path, content):
         """
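For orientation, a rough usage sketch of the two engine entry points after this change. The class name LlamaCppInferenceEngine and the standalone call pattern are assumptions for illustration; only the complete()/chat() signatures come from the diff above, and the n_ctx/n_gpu_layers values are the config defaults shown in config.py:

import asyncio

from codegate.inference.inference_engine import LlamaCppInferenceEngine  # class name assumed


async def demo() -> None:
    engine = LlamaCppInferenceEngine()
    model_path = "./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf"

    # Plain completion requests go through llama-cpp-python's create_completion()
    completion = await engine.complete(
        model_path, n_ctx=32768, n_gpu_layers=-1,
        prompt="def call_api(url):", max_tokens=64, stream=False,
    )
    print(completion["choices"][0]["text"])

    # Chat requests now go through create_chat_completion(), i.e. the OpenAI format
    chat = await engine.chat(
        model_path, n_ctx=32768, n_gpu_layers=-1,
        messages=[{"role": "user", "content": "hello"}], max_tokens=64, stream=False,
    )
    print(chat["choices"][0]["message"]["content"])


asyncio.run(demo())
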
1 change: 0 additions & 1 deletion src/codegate/providers/litellmshim/generators.py
@@ -47,7 +47,6 @@ async def llamacpp_stream_generator(stream: Iterator[Any]) -> AsyncIterator[str]
         if hasattr(chunk, "model_dump_json"):
             chunk = chunk.model_dump_json(exclude_none=True, exclude_unset=True)
         try:
-            chunk["content"] = chunk["choices"][0]["text"]
             yield f"data:{json.dumps(chunk)}\n\n"
             await asyncio.sleep(0)
         except Exception as e:
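Context for the deleted line: in the OpenAI-style streaming format the text already lives under choices[0] (delta for chat, text for completions), so the generator no longer copies it into a top-level content key before serializing the SSE event. A sketch of the event it emits, with illustrative field values:

import json

# Illustrative chat chunk, shaped like the output of create_chat_completion(stream=True)
chunk = {
    "id": "chatcmpl-123",  # illustrative id
    "object": "chat.completion.chunk",
    "choices": [{"index": 0, "delta": {"content": "Hello"}, "finish_reason": None}],
}
print(f"data:{json.dumps(chunk)}\n\n")  # same SSE framing as llamacpp_stream_generator
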
22 changes: 13 additions & 9 deletions src/codegate/providers/llamacpp/completion_handler.py
@@ -20,10 +20,6 @@ def translate_request(self, data: Dict, api_key: str) -> ChatCompletionRequest:
         if completion_request is None:
             raise Exception("Couldn't translate the request")
 
-        # Replace n_predict option with max_tokens
-        if 'n_predict' in completion_request:
-            completion_request['max_tokens'] = completion_request['n_predict']
-            del completion_request['n_predict']
         return ChatCompletionRequest(**completion_request)
 
     def translate_streaming_response(
@@ -50,12 +46,20 @@ async def execute_completion(
         stream: bool = False
     ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
         """
-        Execute the completion request with LiteLLM's API
+        Execute the completion request with the inference engine API
         """
-        response = await self.inference_engine.chat(Config.get_config().chat_model_path,
-                                                    Config.get_config().chat_model_n_ctx,
-                                                    Config.get_config().chat_model_n_gpu_layers,
-                                                    **request)
+        model_path = f"{Config.get_config().model_base_path}/{request['model']}.gguf"
+
+        if 'prompt' in request:
+            response = await self.inference_engine.complete(model_path,
+                                                             Config.get_config().chat_model_n_ctx,
+                                                             Config.get_config().chat_model_n_gpu_layers,
+                                                             **request)
+        else:
+            response = await self.inference_engine.chat(model_path,
+                                                        Config.get_config().chat_model_n_ctx,
+                                                        Config.get_config().chat_model_n_gpu_layers,
+                                                        **request)
         return response
 
     def create_streaming_response(
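In short, execute_completion() now dispatches on the request shape: a body with a prompt key (FIM/legacy completions) goes to complete(), anything else is treated as a chat request. Two illustrative payloads, with the model name taken from the tests below and the prompt text made up for the example:

fim_request = {
    "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
    "prompt": "<|fim_prefix|>def call_api(url):<|fim_suffix|><|fim_middle|>",  # example prompt
    "stream": True,
}
# 'prompt' present -> inference_engine.complete("./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf", ...)

chat_request = {
    "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
    "messages": [{"role": "user", "content": "hello"}],
    "stream": True,
}
# no 'prompt' -> inference_engine.chat("./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf", ...)
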
8 changes: 4 additions & 4 deletions src/codegate/providers/llamacpp/provider.py
@@ -19,11 +19,11 @@ def provider_route_name(self) -> str:
 
     def _setup_routes(self):
         """
-        Sets up the /chat route for the provider as expected by the
-        Llama API. Extracts the API key from the "Authorization" header and
-        passes it to the completion handler.
+        Sets up the /completions and /chat/completions routes for the
+        provider as expected by the Llama API.
         """
-        @self.router.post(f"/{self.provider_route_name}/completion")
+        @self.router.post(f"/{self.provider_route_name}/completions")
+        @self.router.post(f"/{self.provider_route_name}/chat/completions")
         async def create_completion(
             request: Request,
         ):
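The route prefix comes from provider_route_name, which is defined outside this diff; assuming it resolves to "llamacpp", the two decorators above register the following paths:

provider_route_name = "llamacpp"  # assumed value, not shown in this diff

routes = [
    f"/{provider_route_name}/completions",       # plain/FIM completions
    f"/{provider_route_name}/chat/completions",  # OpenAI-style chat completions
]
print(routes)  # ['/llamacpp/completions', '/llamacpp/chat/completions']
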
51 changes: 35 additions & 16 deletions tests/test_inference.py
@@ -1,38 +1,57 @@
 import pytest
 
-# @pytest.mark.asyncio
-# async def test_generate(inference_engine) -> None:
-#     """Test code generation."""
-
-#     prompt = '''
-# import requests
+@pytest.mark.asyncio
+async def test_generate(inference_engine) -> None:
+    """Test code generation."""
 
-#     # Function to call API over http
-#     def call_api(url):
-#     '''
-#     model_path = "./models/qwen2.5-coder-1.5B.q5_k_m.gguf"
+    completion_request = {
+        "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
+        "max_tokens": 4096,
+        "temperature": 0,
+        "stream": True,
+        "stop": [
+            "<|endoftext|>",
+            "<|fim_prefix|>",
+            "<|fim_middle|>",
+            "<|fim_suffix|>",
+            "<|fim_pad|>",
+            "<|repo_name|>",
+            "<|file_sep|>",
+            "<|im_start|>",
+            "<|im_end|>",
+            "/src/",
+            "#- coding: utf-8",
+            "```",
+        ],
+        "prompt": "<|fim_prefix|>\\n# codegate/test.py\\nimport requests\\n\\ndef call_api(url):\\n"
+        " <|fim_suffix|>\\n\\n\\n\\nresponse = call_api('http://localhost/test')"
+        "\\nprint(response)<|fim_middle|>",
+    }
+    model_path = f"./models/{completion_request['model']}.gguf"
+    response = await inference_engine.complete(model_path, **completion_request)
 
-#     async for chunk in inference_engine.generate(model_path, prompt):
-#         print(chunk)
+    for chunk in response:
+        assert chunk["choices"][0]["text"] is not None
 
 
 @pytest.mark.asyncio
 async def test_chat(inference_engine) -> None:
     """Test chat completion."""
 
     chat_request = {
-        "prompt": "<|im_start|>user\\nhello<|im_end|>\\n<|im_start|>assistant\\n",
-        "stream": True,
+        "messages": [{"role": "user", "content": "hello"}],
+        "model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
         "max_tokens": 4096,
         "top_k": 50,
         "temperature": 0,
+        "stream": True,
     }
 
-    model_path = "./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf"
+    model_path = f"./models/{chat_request['model']}.gguf"
     response = await inference_engine.chat(model_path, **chat_request)
 
     for chunk in response:
-        assert chunk["choices"][0]["text"] is not None
+        assert 'delta' in chunk["choices"][0]
 
 
 @pytest.mark.asyncio
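Both tests rely on an inference_engine fixture that is defined outside this diff (presumably in conftest.py). A minimal sketch of what such a fixture could look like; the class name and module path are assumptions:

import pytest

from codegate.inference.inference_engine import LlamaCppInferenceEngine  # class name assumed


@pytest.fixture
def inference_engine() -> LlamaCppInferenceEngine:
    # The engine methods themselves are async, so a plain fixture returning
    # an instance is enough for the @pytest.mark.asyncio tests above.
    return LlamaCppInferenceEngine()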