Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Warn about malicious, deprecated or archived packages in output snippets #215

Merged
merged 2 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/codegate/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from codegate.config import Config, ConfigurationError
from codegate.db.connection import init_db_sync
from codegate.server import init_app
from src.codegate.storage.utils import restore_storage_backup
from codegate.storage.utils import restore_storage_backup


def validate_port(ctx: click.Context, param: click.Parameter, value: int) -> int:
Expand Down
4 changes: 2 additions & 2 deletions src/codegate/llm_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from src.codegate.llm_utils.extractor import PackageExtractor
from src.codegate.llm_utils.llmclient import LLMClient
from codegate.llm_utils.extractor import PackageExtractor
from codegate.llm_utils.llmclient import LLMClient

__all__ = ["LLMClient", "PackageExtractor"]
4 changes: 4 additions & 0 deletions src/codegate/llm_utils/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from codegate.config import Config
from codegate.llm_utils.llmclient import LLMClient
from codegate.storage import StorageEngine

logger = structlog.get_logger("codegate")

Expand All @@ -13,6 +14,9 @@ class PackageExtractor:
Utility class to extract package names from code or queries.
"""

# Build a StorageEngine and keep it on the instance as `storage_engine`.
# NOTE(review): `extract_packages` below is declared @staticmethod, so it has
# no `self` and cannot read this attribute — confirm an instance-level
# consumer exists, otherwise this construction is unused work.
def __init__(self):
self.storage_engine = StorageEngine()

@staticmethod
async def extract_packages(
content: str,
Expand Down
37 changes: 28 additions & 9 deletions src/codegate/pipeline/extract_snippets/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from codegate.pipeline.base import CodeSnippet, PipelineContext, PipelineSensitiveData
from codegate.pipeline.extract_snippets.extract_snippets import extract_snippets
from codegate.pipeline.output import OutputPipelineContext, OutputPipelineStep
from codegate.storage import StorageEngine

logger = structlog.get_logger("codegate")

Expand All @@ -16,7 +17,7 @@ class CodeCommentStep(OutputPipelineStep):
"""Pipeline step that adds comments after code blocks"""

def __init__(self):
pass
self._storage_engine = StorageEngine()

@property
def name(self) -> str:
Expand All @@ -41,8 +42,7 @@ def _create_chunk(self, original_chunk: ModelResponse, content: str) -> ModelRes
object="chat.completion.chunk",
)

@staticmethod
async def _snippet_comment(snippet: CodeSnippet, secrets: PipelineSensitiveData) -> str:
async def _snippet_comment(self, snippet: CodeSnippet, secrets: PipelineSensitiveData) -> str:
"""Create a comment for a snippet"""
snippet.libraries = await PackageExtractor.extract_packages(
content=snippet.code,
Expand All @@ -52,14 +52,33 @@ async def _snippet_comment(snippet: CodeSnippet, secrets: PipelineSensitiveData)
base_url=secrets.api_base,
)

libobjects = await self._storage_engine.search_by_property("name", snippet.libraries)
logger.info(f"Found {len(libobjects)} libraries in the storage engine")

libraries_text = ""
warnings = []

# Use snippet.libraries to generate a CSV list of libraries
if snippet.libraries:
libraries_text = " ".join(f"`{lib}`" for lib in snippet.libraries)
if libraries_text:
comment = f"\nThe above code snippet uses the following libraries: {libraries_text}\n"
else:
comment = "\ncodegate didn't detect any libraries in the snippet\n"
comment += "\n"
libraries_text = ", ".join([f"`{lib}`" for lib in snippet.libraries])

for lib in libobjects:
lib_name = lib.properties["name"]
lib_status = lib.properties["status"]
lib_url = f"https://www.insight.stacklok.com/report/{lib.properties['type']}/{lib_name}"

warnings.append(
f"- The package `{lib_name}` is marked as **{lib_status}**.\n"
f"- More information: [{lib_url}]({lib_url})\n"
)

comment = ""
if libraries_text != "":
comment += f"\n\nCodegate detected the following libraries: {libraries_text}\n"

if warnings:
comment += "\n### 🚨 Warnings\n" + "\n".join(warnings) + "\n"

return comment

def _split_chunk_at_code_end(self, content: str) -> tuple[str, str]:
Expand Down
34 changes: 33 additions & 1 deletion src/codegate/storage/storage_engine.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from typing import List

import structlog
import weaviate
import weaviate.classes as wvc
from weaviate.classes.config import DataType
from weaviate.classes.query import MetadataQuery
from weaviate.classes.query import Filter, MetadataQuery
from weaviate.embedded import EmbeddedOptions

from codegate.config import Config
Expand Down Expand Up @@ -88,6 +90,36 @@ def setup_schema(self, client):
)
logger.info(f"Weaviate schema for class {class_config['name']} setup complete.")

async def search_by_property(self, name: str, properties: List[str]) -> list[object]:
    """
    Fetch objects from the 'Package' collection by exact property filter.

    This is a filter query (``contains_any`` on a single property), not a
    vector similarity search.

    Args:
        name: Name of the property on the 'Package' collection to filter on.
        properties: Candidate values; an object is selected when its ``name``
            property contains any of them.

    Returns:
        The objects returned by the query, or an empty list when
        ``properties`` is empty/None, no client is available, the response
        is empty, or the query raises.
    """
    # Nothing to look up: skip creating/connecting a client entirely.
    if not properties:
        return []

    weaviate_client = self.get_client(self.data_path)
    # A single truthiness check covers both the "None" and "invalid" cases.
    if not weaviate_client:
        logger.error("Could not find client, not returning results.")
        return []

    # NOTE(review): the client calls below are not awaited, so they presumably
    # block the event loop for the duration of the query — confirm this is
    # acceptable for callers on the streaming path.
    try:
        weaviate_client.connect()
        packages = weaviate_client.collections.get("Package")
        # NOTE(review): no explicit `limit` is passed, so whatever default the
        # backend applies bounds the number of returned objects — confirm.
        response = packages.query.fetch_objects(
            filters=Filter.by_property(name).contains_any(properties),
        )

        if not response:
            return []
        return response.objects
    except Exception as e:
        # Best-effort lookup: a storage failure must not break the caller,
        # so log and fall back to "no matches".
        logger.error(f"An error occurred: {str(e)}")
        return []
    finally:
        # Always release the connection, including on the error path.
        weaviate_client.close()

async def search(self, query: str, limit=5, distance=0.3, packages=None) -> list[object]:
"""
Search the 'Package' collection based on a query string.
Expand Down