
Commit 1179eb4

Merge pull request #152 from stacklok/add-ollama-to-pipeline
Use pipelines in Ollama provider
2 parents 06d8baa + 2d97e2c commit 1179eb4

8 files changed: +123 -150 lines changed

src/codegate/cli.py

Lines changed: 9 additions & 0 deletions
@@ -107,6 +107,12 @@ def show_prompts(prompts: Optional[Path]) -> None:
     default=None,
     help="Anthropic provider URL (default: https://api.anthropic.com/v1)",
 )
+@click.option(
+    "--ollama-url",
+    type=str,
+    default=None,
+    help="Ollama provider URL (default: http://localhost:11434/api)",
+)
 def serve(
     port: Optional[int],
     host: Optional[str],
@@ -117,6 +123,7 @@ def serve(
     vllm_url: Optional[str],
     openai_url: Optional[str],
     anthropic_url: Optional[str],
+    ollama_url: Optional[str],
 ) -> None:
     """Start the codegate server."""
     logger = None
@@ -129,6 +136,8 @@ def serve(
         cli_provider_urls["openai"] = openai_url
     if anthropic_url:
         cli_provider_urls["anthropic"] = anthropic_url
+    if ollama_url:
+        cli_provider_urls["ollama"] = ollama_url
 
     # Load configuration with priority resolution
     cfg = Config.load(
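
For illustration, a minimal sketch of what the new flag does at runtime, following the logic in the hunk above (the URL value and surrounding code are assumptions for the example, not part of the commit):

    cli_provider_urls = {}
    ollama_url = "http://my-ollama-host:11434/api"  # value that --ollama-url would supply
    if ollama_url:
        cli_provider_urls["ollama"] = ollama_url
    # cli_provider_urls is then handed to Config.load(), whose priority resolution
    # lets the CLI value override the default declared in config.py.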

src/codegate/config.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
     "openai": "https://api.openai.com/v1",
     "anthropic": "https://api.anthropic.com/v1",
     "vllm": "http://localhost:8000",  # Base URL without /v1 path
-    "ollama": "http://localhost:11434",  # Default Ollama server URL
+    "ollama": "http://localhost:11434/api",  # Default Ollama server URL
 }
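
The trailing /api on the new default matters because the Ollama completion handler (added below) appends the route name directly to the configured base URL; a small illustration of the resulting endpoints:

    base_url = "http://localhost:11434/api"  # new default from this hunk
    print(f"{base_url}/chat")      # -> http://localhost:11434/api/chat
    print(f"{base_url}/generate")  # -> http://localhost:11434/api/generate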
src/codegate/providers/base.py

Lines changed: 2 additions & 1 deletion
@@ -95,7 +95,8 @@ def _is_fim_request_url(self, request: Request) -> bool:
         if request_path.endswith("/chat/completions"):
             return False
 
-        if request_path.endswith("/completions"):
+        # /completions is for OpenAI standard. /api/generate is for ollama.
+        if request_path.endswith("/completions") or request_path.endswith("/api/generate"):
             return True
 
         return False
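
As a standalone sketch of the routing rule this hunk encodes (the function name and sample paths here are illustrative; the real check lives in _is_fim_request_url):

    def looks_like_fim(request_path: str) -> bool:
        # Chat endpoints are never treated as FIM
        if request_path.endswith("/chat/completions"):
            return False
        # OpenAI-style completions and Ollama's native generate endpoint are FIM candidates
        return request_path.endswith("/completions") or request_path.endswith("/api/generate")

    assert looks_like_fim("/ollama/api/generate") is True
    assert looks_like_fim("/ollama/chat/completions") is False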

src/codegate/providers/completion/base.py

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
+import inspect
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
 from typing import Any, AsyncIterator, Optional, Union
@@ -35,6 +36,6 @@ def create_response(self, response: Any) -> Union[JSONResponse, StreamingResponse]:
         """
         Create a FastAPI response from the completion response.
         """
-        if isinstance(response, Iterator):
+        if isinstance(response, Iterator) or inspect.isasyncgen(response):
             return self._create_streaming_response(response)
         return self._create_json_response(response)
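
The extra inspect.isasyncgen check is needed because async generators, such as the one returned by the new Ollama handler, are not instances of collections.abc.Iterator. A quick stdlib-only demonstration:

    import inspect
    from collections.abc import Iterator

    def sync_gen():
        yield "chunk"

    async def async_gen():
        yield "chunk"

    assert isinstance(sync_gen(), Iterator)       # matched by the old check
    assert not isinstance(async_gen(), Iterator)  # missed by the old check
    assert inspect.isasyncgen(async_gen())        # matched by the new check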

src/codegate/providers/ollama/adapter.py

Lines changed: 16 additions & 7 deletions
@@ -15,25 +15,34 @@ def normalize(self, data: Dict) -> ChatCompletionRequest:
         """
         # Make a copy of the data to avoid modifying the original
         normalized_data = data.copy()
+        normalized_data["options"] = data.get("options", {})
+
+        # Add any context or system prompt if provided
+        if "context" in data:
+            normalized_data["context"] = data["context"]
+        if "system" in data:
+            normalized_data["system"] = data["system"]
 
         # Format the model name
         if "model" in normalized_data:
-            normalized_data["model"] = normalized_data["model"].strip()
+            normalized_data["model"] = data["model"].strip()
 
         # Convert messages format if needed
-        if "messages" in normalized_data:
-            messages = normalized_data["messages"]
+        if "messages" in data:
+            messages = data["messages"]
             converted_messages = []
             for msg in messages:
-                if isinstance(msg.get("content"), list):
+                role = msg.get("role", "")
+                content = msg.get("content", "")
+                new_msg = {"role": role, "content": content}
+                if isinstance(content, list):
                     # Convert list format to string
                     content_parts = []
                     for part in msg["content"]:
                         if part.get("type") == "text":
                             content_parts.append(part["text"])
-                    msg = msg.copy()
-                    msg["content"] = " ".join(content_parts)
-                converted_messages.append(msg)
+                    new_msg["content"] = " ".join(content_parts)
+                converted_messages.append(new_msg)
             normalized_data["messages"] = converted_messages
 
         # Ensure the base_url ends with /api if provided
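
To make the normalization concrete, here is an illustrative input and the result implied by the logic above (the model name and message text are invented for the example):

    data = {
        "model": " codellama:7b-instruct ",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Write a"},
                    {"type": "text", "text": "hello world function"},
                ],
            }
        ],
    }
    # OllamaInputNormalizer().normalize(data) would produce, per the diff:
    # {
    #     "model": "codellama:7b-instruct",   # whitespace stripped
    #     "options": {},                      # defaulted when absent
    #     "messages": [{"role": "user", "content": "Write a hello world function"}],
    # }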

src/codegate/providers/ollama/completion_handler.py

Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+import asyncio
+import json
+from typing import Any, AsyncIterator, Optional
+
+import httpx
+import structlog
+from fastapi.responses import JSONResponse, StreamingResponse
+from litellm import ChatCompletionRequest
+
+from codegate.providers.base import BaseCompletionHandler
+
+logger = structlog.get_logger("codegate")
+
+
+async def get_async_ollama_response(client, request_url, data):
+    try:
+        async with client.stream("POST", request_url, json=data, timeout=30.0) as response:
+            response.raise_for_status()
+            async for line in response.aiter_lines():
+                if line.strip():
+                    try:
+                        # Parse the response to ensure it's valid JSON
+                        response_data = json.loads(line)
+                        # Add newline to ensure proper streaming
+                        yield line.encode("utf-8") + b"\n"
+                        # If this is the final response, break
+                        if response_data.get("done", False):
+                            break
+                        # Small delay to prevent overwhelming the client
+                        await asyncio.sleep(0.01)
+                    except json.JSONDecodeError:
+                        yield json.dumps({"error": "Invalid JSON response"}).encode("utf-8") + b"\n"
+                        break
+                    except Exception as e:
+                        yield json.dumps({"error": str(e)}).encode("utf-8") + b"\n"
+                        break
+    except Exception as e:
+        yield json.dumps({"error": f"Stream error: {str(e)}"}).encode("utf-8") + b"\n"
+
+
+class OllamaCompletionHandler(BaseCompletionHandler):
+    def __init__(self):
+        self.client = httpx.AsyncClient(timeout=30.0)
+        # Depends on whether the request is Chat or FIM
+        self._url_mapping = {False: "/chat", True: "/generate"}
+
+    async def execute_completion(
+        self,
+        request: ChatCompletionRequest,
+        api_key: Optional[str],
+        stream: bool = False,
+        is_fim_request: bool = False,
+    ) -> AsyncIterator:
+        """Stream response directly from Ollama API."""
+        request_path = self._url_mapping[is_fim_request]
+        request_url = f"{request['base_url']}{request_path}"
+        return get_async_ollama_response(self.client, request_url, request)
+
+    def _create_streaming_response(self, stream: AsyncIterator[Any]) -> StreamingResponse:
+        """
+        Create a streaming response from a stream generator. The StreamingResponse
+        is the format that FastAPI expects for streaming responses.
+        """
+        return StreamingResponse(
+            stream,
+            media_type="application/x-ndjson",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            },
+        )
+
+    def _create_json_response(self, response: Any) -> JSONResponse:
+        raise NotImplementedError("JSON Response in Ollama not implemented yet.")
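
A hedged sketch of driving get_async_ollama_response on its own, outside FastAPI (the model, prompt, and locally running Ollama server are assumptions for the example):

    import asyncio
    import httpx

    async def main():
        client = httpx.AsyncClient(timeout=30.0)
        data = {"model": "codellama:7b-instruct", "prompt": "def add(a, b):", "stream": True}
        async for chunk in get_async_ollama_response(
            client, "http://localhost:11434/api/generate", data
        ):
            print(chunk.decode("utf-8"), end="")
        await client.aclose()

    asyncio.run(main())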
Lines changed: 19 additions & 123 deletions
@@ -1,42 +1,12 @@
-import asyncio
 import json
 from typing import Optional
 
-import httpx
-from fastapi import Header, HTTPException, Request
-from fastapi.responses import StreamingResponse
+from fastapi import Request
 
 from codegate.config import Config
 from codegate.providers.base import BaseProvider, SequentialPipelineProcessor
-from codegate.providers.litellmshim import LiteLLmShim, sse_stream_generator
 from codegate.providers.ollama.adapter import OllamaInputNormalizer, OllamaOutputNormalizer
-
-
-async def stream_ollama_response(client: httpx.AsyncClient, url: str, data: dict):
-    """Stream response directly from Ollama API."""
-    try:
-        async with client.stream("POST", url, json=data, timeout=30.0) as response:
-            response.raise_for_status()
-            async for line in response.aiter_lines():
-                if line.strip():
-                    try:
-                        # Parse the response to ensure it's valid JSON
-                        response_data = json.loads(line)
-                        # Add newline to ensure proper streaming
-                        yield line.encode("utf-8") + b"\n"
-                        # If this is the final response, break
-                        if response_data.get("done", False):
-                            break
-                        # Small delay to prevent overwhelming the client
-                        await asyncio.sleep(0.01)
-                    except json.JSONDecodeError:
-                        yield json.dumps({"error": "Invalid JSON response"}).encode("utf-8") + b"\n"
-                        break
-                    except Exception as e:
-                        yield json.dumps({"error": str(e)}).encode("utf-8") + b"\n"
-                        break
-    except Exception as e:
-        yield json.dumps({"error": f"Stream error: {str(e)}"}).encode("utf-8") + b"\n"
+from codegate.providers.ollama.completion_handler import OllamaCompletionHandler
 
 
 class OllamaProvider(BaseProvider):
@@ -45,15 +15,21 @@ def __init__(
         pipeline_processor: Optional[SequentialPipelineProcessor] = None,
         fim_pipeline_processor: Optional[SequentialPipelineProcessor] = None,
     ):
-        completion_handler = LiteLLmShim(stream_generator=sse_stream_generator)
+        completion_handler = OllamaCompletionHandler()
         super().__init__(
            OllamaInputNormalizer(),
            OllamaOutputNormalizer(),
            completion_handler,
            pipeline_processor,
            fim_pipeline_processor,
        )
-        self.client = httpx.AsyncClient(timeout=30.0)
+        # Get the Ollama base URL
+        config = Config.get_config()
+        if config is None:
+            provided_urls = {}
+        else:
+            provided_urls = config.provider_urls
+        self.base_url = provided_urls.get("ollama", "http://localhost:11434/api")
 
     @property
     def provider_route_name(self) -> str:
@@ -66,96 +42,16 @@ def _setup_routes(self):
 
         # Native Ollama API routes
         @self.router.post(f"/{self.provider_route_name}/api/chat")
-        async def ollama_chat(
-            request: Request,
-            authorization: str = Header(..., description="Bearer token"),
-        ):
-            if not authorization.startswith("Bearer "):
-                raise HTTPException(status_code=401, detail="Invalid authorization header")
-
-            _api_key = authorization.split(" ")[1]
-            body = await request.body()
-            data = json.loads(body)
-
-            # Get the Ollama base URL
-            config = Config.get_config()
-            base_url = config.provider_urls.get("ollama", "http://localhost:11434")
-
-            # Convert chat format to Ollama generate format
-            messages = []
-            for msg in data.get("messages", []):
-                role = msg.get("role", "")
-                content = msg.get("content", "")
-                if isinstance(content, list):
-                    # Handle list-based content format
-                    content = " ".join(
-                        part["text"] for part in content if part.get("type") == "text"
-                    )
-                messages.append({"role": role, "content": content})
-
-            ollama_data = {
-                "model": data.get("model", "").strip(),
-                "messages": messages,
-                "stream": True,
-                "options": data.get("options", {}),
-            }
-
-            # Stream response directly from Ollama
-            return StreamingResponse(
-                stream_ollama_response(self.client, f"{base_url}/api/chat", ollama_data),
-                media_type="application/x-ndjson",
-                headers={
-                    "Cache-Control": "no-cache",
-                    "Connection": "keep-alive",
-                },
-            )
-
         @self.router.post(f"/{self.provider_route_name}/api/generate")
-        async def ollama_generate(
-            request: Request,
-            authorization: str = Header(..., description="Bearer token"),
-        ):
-            if not authorization.startswith("Bearer "):
-                raise HTTPException(status_code=401, detail="Invalid authorization header")
-
-            _api_key = authorization.split(" ")[1]
-            body = await request.body()
-            data = json.loads(body)
-
-            # Get the Ollama base URL
-            config = Config.get_config()
-            base_url = config.provider_urls.get("ollama", "http://localhost:11434")
-
-            # Prepare generate request
-            ollama_data = {
-                "model": data.get("model", "").strip(),
-                "prompt": data.get("prompt", ""),
-                "stream": True,
-                "options": data.get("options", {}),
-            }
-
-            # Add any context or system prompt if provided
-            if "context" in data:
-                ollama_data["context"] = data["context"]
-            if "system" in data:
-                ollama_data["system"] = data["system"]
-
-            # Stream response directly from Ollama
-            return StreamingResponse(
-                stream_ollama_response(self.client, f"{base_url}/api/generate", ollama_data),
-                media_type="application/x-ndjson",
-                headers={
-                    "Cache-Control": "no-cache",
-                    "Connection": "keep-alive",
-                },
-            )
-
         # OpenAI-compatible routes for backward compatibility
         @self.router.post(f"/{self.provider_route_name}/chat/completions")
         @self.router.post(f"/{self.provider_route_name}/completions")
-        async def create_completion(
-            request: Request,
-            authorization: str = Header(..., description="Bearer token"),
-        ):
-            # Redirect to native Ollama endpoint
-            return await ollama_chat(request, authorization)
+        async def create_completion(request: Request):
+            body = await request.body()
+            data = json.loads(body)
+            if "base_url" not in data or not data["base_url"]:
+                data["base_url"] = self.base_url
+
+            is_fim_request = self._is_fim_request(request, data)
+            stream = await self.complete(data, None, is_fim_request=is_fim_request)
+            return self._completion_handler.create_response(stream)
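
With the provider now routed through the pipeline, the OpenAI-compatible route can be exercised roughly as follows (the codegate host and port are assumptions, not taken from this diff; the path matches the route registered above):

    import httpx

    with httpx.stream(
        "POST",
        "http://localhost:8989/ollama/chat/completions",
        json={
            "model": "codellama:7b-instruct",
            "messages": [{"role": "user", "content": "Hello"}],
        },
        timeout=30.0,
    ) as resp:
        for line in resp.iter_lines():
            print(line)  # NDJSON chunks relayed from Ollama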

tests/providers/ollama/test_ollama_provider.py

Lines changed: 0 additions & 17 deletions
@@ -74,7 +74,6 @@ def test_ollama_chat(mock_config, test_client):
     assert sent_data["model"] == "codellama:7b-instruct"
     assert sent_data["messages"] == data["messages"]
     assert sent_data["options"] == data["options"]
-    assert sent_data["stream"] is True
 
 
 @patch("codegate.config.Config.get_config", return_value=MockConfig())
@@ -120,7 +119,6 @@ def test_ollama_generate(mock_config, test_client):
     assert sent_data["options"] == data["options"]
     assert sent_data["context"] == data["context"]
     assert sent_data["system"] == data["system"]
-    assert sent_data["stream"] is True
 
 
 @patch("codegate.config.Config.get_config", return_value=MockConfig())
@@ -140,18 +138,3 @@ def test_ollama_error_handling(mock_config, test_client):
     content = response.content.decode().strip()
     assert "error" in content
     assert "Model not found" in content
-
-
-def test_ollama_auth_required(test_client):
-    """Test authentication requirement."""
-    data = {"model": "codellama:7b-instruct"}
-
-    # Test without auth header
-    response = test_client.post("/ollama/api/generate", json=data)
-    assert response.status_code == 422
-
-    # Test with invalid auth header
-    response = test_client.post(
-        "/ollama/api/generate", json=data, headers={"Authorization": "Invalid"}
-    )
-    assert response.status_code == 401
