diff --git a/src/codegate/config.py b/src/codegate/config.py
index f019da73..3d39134c 100644
--- a/src/codegate/config.py
+++ b/src/codegate/config.py
@@ -28,7 +28,7 @@ class Config:
     log_format: LogFormat = LogFormat.JSON
     prompts: PromptConfig = field(default_factory=PromptConfig)
 
-    chat_model_path: str = "./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf"
+    model_base_path: str = "./models"
     chat_model_n_ctx: int = 32768
     chat_model_n_gpu_layers: int = -1
 
@@ -102,7 +102,7 @@ def from_file(cls, config_path: Union[str, Path]) -> "Config":
                 host=config_data.get("host", cls.host),
                 log_level=config_data.get("log_level", cls.log_level.value),
                 log_format=config_data.get("log_format", cls.log_format.value),
-                chat_model_path=config_data.get("chat_model_path", cls.chat_model_path),
+                model_base_path=config_data.get("chat_model_path", cls.model_base_path),
                 chat_model_n_ctx=config_data.get(
                     "chat_model_n_ctx", cls.chat_model_n_ctx
                 ),
diff --git a/src/codegate/inference/inference_engine.py b/src/codegate/inference/inference_engine.py
index 9bf4d103..74d2808c 100644
--- a/src/codegate/inference/inference_engine.py
+++ b/src/codegate/inference/inference_engine.py
@@ -46,6 +46,15 @@ async def __get_model(self, model_path, embedding=False, n_ctx=512, n_gpu_layers
         return self.__models[model_path]
 
+    async def complete(self, model_path, n_ctx=512, n_gpu_layers=0, **completion_request):
+        """
+        Generates a completion using the specified model and request parameters.
+        """
+        model = await self.__get_model(
+            model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers
+        )
+        return model.create_completion(**completion_request)
+
     async def chat(self, model_path, n_ctx=512, n_gpu_layers=0, **chat_completion_request):
         """
         Generates a chat completion using the specified model and request parameters.
@@ -53,7 +62,7 @@ async def chat(self, model_path, n_ctx=512, n_gpu_layers=0, **chat_completion_re
         model = await self.__get_model(
             model_path=model_path, n_ctx=n_ctx, n_gpu_layers=n_gpu_layers
         )
-        return model.create_completion(**chat_completion_request)
+        return model.create_chat_completion(**chat_completion_request)
 
     async def embed(self, model_path, content):
         """
diff --git a/src/codegate/providers/litellmshim/generators.py b/src/codegate/providers/litellmshim/generators.py
index 2ec41ec7..c9ad8fc8 100644
--- a/src/codegate/providers/litellmshim/generators.py
+++ b/src/codegate/providers/litellmshim/generators.py
@@ -47,7 +47,6 @@ async def llamacpp_stream_generator(stream: Iterator[Any]) -> AsyncIterator[str]
             if hasattr(chunk, "model_dump_json"):
                 chunk = chunk.model_dump_json(exclude_none=True, exclude_unset=True)
             try:
-                chunk["content"] = chunk["choices"][0]["text"]
                 yield f"data:{json.dumps(chunk)}\n\n"
                 await asyncio.sleep(0)
             except Exception as e:
diff --git a/src/codegate/providers/llamacpp/completion_handler.py b/src/codegate/providers/llamacpp/completion_handler.py
index 9c0a5744..822947eb 100644
--- a/src/codegate/providers/llamacpp/completion_handler.py
+++ b/src/codegate/providers/llamacpp/completion_handler.py
@@ -20,10 +20,6 @@ def translate_request(self, data: Dict, api_key: str) -> ChatCompletionRequest:
         if completion_request is None:
             raise Exception("Couldn't translate the request")
 
-        # Replace n_predict option with max_tokens
-        if 'n_predict' in completion_request:
-            completion_request['max_tokens'] = completion_request['n_predict']
-            del completion_request['n_predict']
         return ChatCompletionRequest(**completion_request)
 
     def translate_streaming_response(
@@ -50,12 +46,20 @@ async def execute_completion(
         stream: bool = False
     ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
         """
-        Execute the completion request with LiteLLM's API
+        Execute the completion request with the inference engine API
         """
-        response = await self.inference_engine.chat(Config.get_config().chat_model_path,
-                                                    Config.get_config().chat_model_n_ctx,
-                                                    Config.get_config().chat_model_n_gpu_layers,
-                                                    **request)
+        model_path = f"{Config.get_config().model_base_path}/{request['model']}.gguf"
+
+        if 'prompt' in request:
+            response = await self.inference_engine.complete(model_path,
+                                                            Config.get_config().chat_model_n_ctx,
+                                                            Config.get_config().chat_model_n_gpu_layers,
+                                                            **request)
+        else:
+            response = await self.inference_engine.chat(model_path,
+                                                        Config.get_config().chat_model_n_ctx,
+                                                        Config.get_config().chat_model_n_gpu_layers,
+                                                        **request)
         return response
 
     def create_streaming_response(
diff --git a/src/codegate/providers/llamacpp/provider.py b/src/codegate/providers/llamacpp/provider.py
index a0e349e5..a3227085 100644
--- a/src/codegate/providers/llamacpp/provider.py
+++ b/src/codegate/providers/llamacpp/provider.py
@@ -19,11 +19,11 @@ def provider_route_name(self) -> str:
 
     def _setup_routes(self):
         """
-        Sets up the /chat route for the provider as expected by the
-        Llama API. Extracts the API key from the "Authorization" header and
-        passes it to the completion handler.
+        Sets up the /completions and /chat/completions routes for the
+        provider as expected by the Llama API.
""" - @self.router.post(f"/{self.provider_route_name}/completion") + @self.router.post(f"/{self.provider_route_name}/completions") + @self.router.post(f"/{self.provider_route_name}/chat/completions") async def create_completion( request: Request, ): diff --git a/tests/test_inference.py b/tests/test_inference.py index fe562c88..86c2eeea 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -1,19 +1,38 @@ import pytest -# @pytest.mark.asyncio -# async def test_generate(inference_engine) -> None: -# """Test code generation.""" -# prompt = ''' -# import requests +@pytest.mark.asyncio +async def test_generate(inference_engine) -> None: + """Test code generation.""" -# # Function to call API over http -# def call_api(url): -# ''' -# model_path = "./models/qwen2.5-coder-1.5B.q5_k_m.gguf" + completion_request = { + "model": "qwen2.5-coder-1.5b-instruct-q5_k_m", + "max_tokens": 4096, + "temperature": 0, + "stream": True, + "stop": [ + "<|endoftext|>", + "<|fim_prefix|>", + "<|fim_middle|>", + "<|fim_suffix|>", + "<|fim_pad|>", + "<|repo_name|>", + "<|file_sep|>", + "<|im_start|>", + "<|im_end|>", + "/src/", + "#- coding: utf-8", + "```", + ], + "prompt": "<|fim_prefix|>\\n# codegate/test.py\\nimport requests\\n\\ndef call_api(url):\\n" + " <|fim_suffix|>\\n\\n\\n\\nresponse = call_api('http://localhost/test')" + "\\nprint(response)<|fim_middle|>", + } + model_path = f"./models/{completion_request['model']}.gguf" + response = await inference_engine.complete(model_path, **completion_request) -# async for chunk in inference_engine.generate(model_path, prompt): -# print(chunk) + for chunk in response: + assert chunk["choices"][0]["text"] is not None @pytest.mark.asyncio @@ -21,18 +40,18 @@ async def test_chat(inference_engine) -> None: """Test chat completion.""" chat_request = { - "prompt": "<|im_start|>user\\nhello<|im_end|>\\n<|im_start|>assistant\\n", - "stream": True, + "messages": [{"role": "user", "content": "hello"}], + "model": "qwen2.5-coder-1.5b-instruct-q5_k_m", "max_tokens": 4096, - "top_k": 50, "temperature": 0, + "stream": True, } - model_path = "./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf" + model_path = f"./models/{chat_request['model']}.gguf" response = await inference_engine.chat(model_path, **chat_request) for chunk in response: - assert chunk["choices"][0]["text"] is not None + assert 'delta' in chunk["choices"][0] @pytest.mark.asyncio