From 023fd6369957ba02373c40e2eb10c85c7911a413 Mon Sep 17 00:00:00 2001 From: Jakub Hrozek Date: Thu, 5 Dec 2024 17:42:39 +0100 Subject: [PATCH 1/2] Add comments on archived, malicious or deprecated packages in snippets Use the storage engine to look up information on the packages parsed from the code snippet generated by the LLM. In addition to listing all detected packages, emit a prominent warning for any package that is marked as deprecated, archived or malicious. --- src/codegate/llm_utils/extractor.py | 4 ++ .../pipeline/extract_snippets/output.py | 37 ++++++++++++++----- src/codegate/storage/storage_engine.py | 34 ++++++++++++++++- 3 files changed, 65 insertions(+), 10 deletions(-) diff --git a/src/codegate/llm_utils/extractor.py b/src/codegate/llm_utils/extractor.py index e50bdf5e..5635062a 100644 --- a/src/codegate/llm_utils/extractor.py +++ b/src/codegate/llm_utils/extractor.py @@ -4,6 +4,7 @@ from codegate.config import Config from codegate.llm_utils.llmclient import LLMClient +from codegate.storage import StorageEngine logger = structlog.get_logger("codegate") @@ -13,6 +14,9 @@ class PackageExtractor: Utility class to extract package names from code or queries. 
""" + def __init__(self): + self.storage_engine = StorageEngine() + @staticmethod async def extract_packages( content: str, diff --git a/src/codegate/pipeline/extract_snippets/output.py b/src/codegate/pipeline/extract_snippets/output.py index 16e742dd..bf9243d6 100644 --- a/src/codegate/pipeline/extract_snippets/output.py +++ b/src/codegate/pipeline/extract_snippets/output.py @@ -8,6 +8,7 @@ from codegate.pipeline.base import CodeSnippet, PipelineContext, PipelineSensitiveData from codegate.pipeline.extract_snippets.extract_snippets import extract_snippets from codegate.pipeline.output import OutputPipelineContext, OutputPipelineStep +from codegate.storage import StorageEngine logger = structlog.get_logger("codegate") @@ -16,7 +17,7 @@ class CodeCommentStep(OutputPipelineStep): """Pipeline step that adds comments after code blocks""" def __init__(self): - pass + self._storage_engine = StorageEngine() @property def name(self) -> str: @@ -41,8 +42,7 @@ def _create_chunk(self, original_chunk: ModelResponse, content: str) -> ModelRes object="chat.completion.chunk", ) - @staticmethod - async def _snippet_comment(snippet: CodeSnippet, secrets: PipelineSensitiveData) -> str: + async def _snippet_comment(self, snippet: CodeSnippet, secrets: PipelineSensitiveData) -> str: """Create a comment for a snippet""" snippet.libraries = await PackageExtractor.extract_packages( content=snippet.code, @@ -52,14 +52,33 @@ async def _snippet_comment(snippet: CodeSnippet, secrets: PipelineSensitiveData) base_url=secrets.api_base, ) + libobjects = await self._storage_engine.search_by_property("name", snippet.libraries) + logger.info(f"Found {len(libobjects)} libraries in the storage engine") + libraries_text = "" + warnings = [] + + # Use snippet.libraries to generate a CSV list of libraries if snippet.libraries: - libraries_text = " ".join(f"`{lib}`" for lib in snippet.libraries) - if libraries_text: - comment = f"\nThe above code snippet uses the following libraries: {libraries_text}\n" 
- else: - comment = "\ncodegate didn't detect any libraries in the snippet\n" - comment += "\n" + libraries_text = ", ".join([f"`{lib}`" for lib in snippet.libraries]) + + for lib in libobjects: + lib_name = lib.properties["name"] + lib_status = lib.properties["status"] + lib_url = f"https://www.insight.stacklok.com/report/{lib.properties['type']}/{lib_name}" + + warnings.append( + f"- The package `{lib_name}` is marked as **{lib_status}**.\n" + f"- More information: [{lib_url}]({lib_url})\n" + ) + + comment = "" + if libraries_text != "": + comment += f"\n\nCodegate detected the following libraries: {libraries_text}\n" + + if warnings: + comment += "\n### 🚨 Warnings\n" + "\n".join(warnings) + "\n" + return comment def _split_chunk_at_code_end(self, content: str) -> tuple[str, str]: diff --git a/src/codegate/storage/storage_engine.py b/src/codegate/storage/storage_engine.py index aa9d38b5..d4421117 100644 --- a/src/codegate/storage/storage_engine.py +++ b/src/codegate/storage/storage_engine.py @@ -1,8 +1,10 @@ +from typing import List + import structlog import weaviate import weaviate.classes as wvc from weaviate.classes.config import DataType -from weaviate.classes.query import MetadataQuery +from weaviate.classes.query import Filter, MetadataQuery from weaviate.embedded import EmbeddedOptions from codegate.config import Config @@ -88,6 +90,36 @@ def setup_schema(self, client): ) logger.info(f"Weaviate schema for class {class_config['name']} setup complete.") + async def search_by_property(self, name: str, properties: List[str]) -> list[object]: + if len(properties) == 0: + return [] + + # Perform the vector search + weaviate_client = self.get_client(self.data_path) + if weaviate_client is None: + logger.error("Could not find client, not returning results.") + return [] + + if not weaviate_client: + logger.error("Invalid client, cannot perform search.") + return [] + + try: + weaviate_client.connect() + packages = weaviate_client.collections.get("Package") + 
response = packages.query.fetch_objects( + filters=Filter.by_property(name).contains_any(properties), + ) + + if not response: + return [] + return response.objects + except Exception as e: + logger.error(f"An error occurred: {str(e)}") + return [] + finally: + weaviate_client.close() + async def search(self, query: str, limit=5, distance=0.3, packages=None) -> list[object]: """ Search the 'Package' collection based on a query string. From e61d78214add77ea7d40b27396c1bf8422dccfd1 Mon Sep 17 00:00:00 2001 From: Jakub Hrozek Date: Thu, 5 Dec 2024 22:49:48 +0100 Subject: [PATCH 2/2] Do not import src.codegate, just codegate --- src/codegate/cli.py | 2 +- src/codegate/llm_utils/__init__.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/codegate/cli.py b/src/codegate/cli.py index 86a4e6af..fdd5c4e0 100644 --- a/src/codegate/cli.py +++ b/src/codegate/cli.py @@ -11,7 +11,7 @@ from codegate.config import Config, ConfigurationError from codegate.db.connection import init_db_sync from codegate.server import init_app -from src.codegate.storage.utils import restore_storage_backup +from codegate.storage.utils import restore_storage_backup def validate_port(ctx: click.Context, param: click.Parameter, value: int) -> int: diff --git a/src/codegate/llm_utils/__init__.py b/src/codegate/llm_utils/__init__.py index b37716f8..c688370e 100644 --- a/src/codegate/llm_utils/__init__.py +++ b/src/codegate/llm_utils/__init__.py @@ -1,4 +1,4 @@ -from src.codegate.llm_utils.extractor import PackageExtractor -from src.codegate.llm_utils.llmclient import LLMClient +from codegate.llm_utils.extractor import PackageExtractor +from codegate.llm_utils.llmclient import LLMClient __all__ = ["LLMClient", "PackageExtractor"]