@@ -20,10 +20,6 @@ def translate_request(self, data: Dict, api_key: str) -> ChatCompletionRequest:
         if completion_request is None:
             raise Exception("Couldn't translate the request")
 
-        # Replace n_predict option with max_tokens
-        if 'n_predict' in completion_request:
-            completion_request['max_tokens'] = completion_request['n_predict']
-            del completion_request['n_predict']
         return ChatCompletionRequest(**completion_request)
 
     def translate_streaming_response(
@@ -50,12 +46,20 @@ async def execute_completion(
         stream: bool = False
     ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
         """
-        Execute the completion request with LiteLLM's API
+        Execute the completion request with inference engine API
         """
-        response = await self.inference_engine.chat(Config.get_config().chat_model_path,
-                                                    Config.get_config().chat_model_n_ctx,
-                                                    Config.get_config().chat_model_n_gpu_layers,
-                                                    **request)
+        model_path = f"{Config.get_config().model_base_path}/{request['model']}.gguf"
+
+        if 'prompt' in request:
+            response = await self.inference_engine.complete(model_path,
+                                                            Config.get_config().chat_model_n_ctx,
+                                                            Config.get_config().chat_model_n_gpu_layers,
+                                                            **request)
+        else:
+            response = await self.inference_engine.chat(model_path,
+                                                        Config.get_config().chat_model_n_ctx,
+                                                        Config.get_config().chat_model_n_gpu_layers,
+                                                        **request)
         return response
 
     def create_streaming_response(
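
The behavioral change in execute_completion is the dispatch: requests carrying a 'prompt' key (FIM-style completions) now go to inference_engine.complete, everything else goes to inference_engine.chat, and the model file is resolved per request from model_base_path instead of a single chat_model_path. Below is a minimal, self-contained sketch of that routing; the stub engine class, the model names, and the fixed n_ctx / n_gpu_layers values (512, 0) are placeholders for illustration only, where the real code uses self.inference_engine and reads settings from Config.

import asyncio
from typing import Any, Dict


class StubEngine:
    # Hypothetical stand-in for self.inference_engine; it just reports which API was called.
    async def complete(self, model_path: str, n_ctx: int, n_gpu_layers: int, **request) -> str:
        return f"complete() -> {model_path}"

    async def chat(self, model_path: str, n_ctx: int, n_gpu_layers: int, **request) -> str:
        return f"chat() -> {model_path}"


async def execute_completion(engine: StubEngine, model_base_path: str, request: Dict[str, Any]) -> str:
    # Resolve the .gguf file from the requested model name, as in the diff.
    model_path = f"{model_base_path}/{request['model']}.gguf"
    if 'prompt' in request:
        # FIM / plain completion requests carry 'prompt'.
        return await engine.complete(model_path, 512, 0, **request)
    # Chat requests carry 'messages' instead.
    return await engine.chat(model_path, 512, 0, **request)


fim_request = {"model": "some-coder-model", "prompt": "def add(a, b):"}
chat_request = {"model": "some-chat-model", "messages": [{"role": "user", "content": "hi"}]}
print(asyncio.run(execute_completion(StubEngine(), "./models", fim_request)))   # complete() -> ./models/some-coder-model.gguf
print(asyncio.run(execute_completion(StubEngine(), "./models", chat_request)))  # chat() -> ./models/some-chat-model.gguf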