This repository was archived by the owner on Jun 5, 2025. It is now read-only.

sqlite-vec vectorization database #438

Merged
merged 18 commits on Dec 24, 2024
3 changes: 2 additions & 1 deletion .gitignore
@@ -43,8 +43,9 @@ htmlcov/
# Weaviate
weaviate_data/

# CodeGate Dashboard DB
# CodeGate Dashboard DB & VectorDB
codegate.db
sqlite_data/vectordb.db

# certificate directory
*certs/
2 changes: 2 additions & 0 deletions README.md
@@ -7,6 +7,8 @@

## Introduction

<img src="./assets/codegate.gif" style="width: 70%; height: 70%;" />

CodeGate is a local gateway that makes AI coding assistants safer. CodeGate
ensures AI-generated recommendations adhere to best practices, while
safeguarding your code's integrity, and protecting your individual privacy. With
Binary file added assets/codegate.gif
23 changes: 6 additions & 17 deletions config.yaml
@@ -6,26 +6,15 @@ host: "localhost" # Host to bind to (use localhost for all interfaces)

# Logging configuration
log_level: "INFO" # One of: ERROR, WARNING, INFO, DEBUG
log_format: "JSON" # One of: JSON, TEXT

# Note: This configuration can be overridden by:
# 1. CLI arguments (--port, --host, --log-level)
# 2. Environment variables (CODEGATE_APP_PORT, CODEGATE_APP_HOST, CODEGATE_APP_LOG_LEVEL)


# Embedding model configuration

####
# Inference model configuration
##

# Model to use for chatting
# Model configuration
model_base_path: "./codegate_volume/models"
embedding_model: "all-minilm-L6-v2-q5_k_m.gguf"

# Context length of the model
# Chat model configuration
chat_model_n_ctx: 32768

# Number of layers to offload to GPU. If -1, all layers are offloaded.
chat_model_n_gpu_layers: -1

# Embedding model
embedding_model: "all-minilm-L6-v2-q5_k_m.gguf"
# Storage configuration
vec_db_path: "./sqlite_data/vectordb.db" # Path to SQLite vector database for similarity search
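
For orientation, here is a minimal sketch of how the new `vec_db_path` setting could be consumed: read the YAML, then open the SQLite file with the sqlite-vec extension loaded. This is illustrative only, assuming PyYAML (already a project dependency); `open_vector_db` is a hypothetical helper, not CodeGate's actual config API.

```python
# Illustrative sketch only -- open_vector_db is a hypothetical helper,
# not CodeGate's actual config loader.
import sqlite3

import sqlite_vec
import yaml  # PyYAML, already listed in pyproject.toml


def open_vector_db(config_path: str = "config.yaml") -> sqlite3.Connection:
    """Open the SQLite vector DB pointed to by vec_db_path in config.yaml."""
    with open(config_path) as f:
        config = yaml.safe_load(f)

    db_path = config.get("vec_db_path", "./sqlite_data/vectordb.db")
    conn = sqlite3.connect(db_path)

    # Load the sqlite-vec extension so its vector SQL functions are available.
    conn.enable_load_extension(True)
    sqlite_vec.load(conn)
    conn.enable_load_extension(False)
    return conn
```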
5 changes: 5 additions & 0 deletions config.yaml.example
@@ -43,7 +43,12 @@ server_key: "server.key" # Server key file name
# CODEGATE_SERVER_CERT
# CODEGATE_SERVER_KEY

# Storage configuration
vec_db_path: "./sqlite_data/vectordb.db" # Path to SQLite vector database file for similarity search

# Embedding model configuration
model_base_path: "./codegate_volume/models" # Base path for model files
embedding_model: "all-minilm-L6-v2-q5_k_m.gguf" # Model to use for embeddings

####
# Inference model configuration
8 changes: 4 additions & 4 deletions data/malicious.jsonl
@@ -4,7 +4,7 @@
{"name":"malicious-pypi-dummy","type":"pypi","description":"Dummy malicious to test with simple package name on pypi"}
{"name":"@prefix/malicious-maven-dummy","type":"maven","description":"Dummy malicious to test with encoded package name on maven"}
{"name":"malicious-maven-dummy","type":"maven","description":"Dummy malicious to test with simple package name on maven"}
{"name":"github.com/malicious-go-dummy","type":"npm","description":"Dummy malicious to test with encoded package name on go"}
{"name":"malicious-go-dummy","type":"npm","description":"Dummy malicious to test with simple package name on go"}
{"name":"@prefix/malicious-crates-dummy","type":"npm","description":"Dummy malicious to test with encoded package name on crates"}
{"name":"malicious-crates-dummy","type":"npm","description":"Dummy malicious to test with simple package name on crates"}
{"name":"github.com/malicious-go-dummy","type":"go","description":"Dummy malicious to test with encoded package name on go"}
{"name":"malicious-go-dummy","type":"go","description":"Dummy malicious to test with simple package name on go"}
{"name":"@prefix/malicious-crates-dummy","type":"crates","description":"Dummy malicious to test with encoded package name on crates"}
{"name":"malicious-crates-dummy","type":"crates","description":"Dummy malicious to test with simple package name on crates"}
275 changes: 26 additions & 249 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -11,7 +11,6 @@ click = ">=8.1.0"
PyYAML = ">=6.0.1"
fastapi = ">=0.115.5"
uvicorn = ">=0.32.1"
weaviate-client = ">=4.9.6"
structlog = ">=24.4.0"
litellm = "^1.55.4"
llama_cpp_python = ">=0.3.2"
@@ -21,6 +20,8 @@ greenlet = "^3.0.3"
aiosqlite = "^0.20.0"
ollama = ">=0.4.4"
pydantic-settings = "^2.7.0"
sqlite-vec = ">=0.1.0"
numpy = ">=1.24.0"

[tool.poetry.group.dev.dependencies]
pytest = ">=7.4.0"
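
With `sqlite-vec` and `numpy` added as dependencies, a quick smoke test confirms the extension loads in the target environment. A minimal sketch, assuming the published `sqlite_vec` Python package and its `vec_version()` SQL function:

```python
# Smoke test: confirm the sqlite-vec extension loads (illustrative, not part of the PR).
import sqlite3

import sqlite_vec

conn = sqlite3.connect(":memory:")
conn.enable_load_extension(True)
sqlite_vec.load(conn)
conn.enable_load_extension(False)

# vec_version() is a SQL function registered by the sqlite-vec extension.
(version,) = conn.execute("SELECT vec_version()").fetchone()
print("sqlite-vec version:", version)
```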
188 changes: 80 additions & 108 deletions scripts/import_packages.py
@@ -2,167 +2,139 @@
import asyncio
import json
import os
import shutil
import sqlite3

import weaviate
from weaviate.classes.config import DataType, Property
from weaviate.embedded import EmbeddedOptions
from weaviate.util import generate_uuid5
import numpy as np
import sqlite_vec

from codegate.inference.inference_engine import LlamaCppInferenceEngine
from codegate.utils.utils import generate_vector_string


class PackageImporter:
def __init__(self, jsonl_dir="data", take_backup=True, restore_backup=True):
self.take_backup_flag = take_backup
self.restore_backup_flag = restore_backup

self.client = weaviate.WeaviateClient(
embedded_options=EmbeddedOptions(
persistence_data_path="./weaviate_data",
grpc_port=50052,
additional_env_vars={
"ENABLE_MODULES": "backup-filesystem",
"BACKUP_FILESYSTEM_PATH": os.getenv("BACKUP_FILESYSTEM_PATH", "/tmp"),
},
)
)
def __init__(self, jsonl_dir="data", db_path="./sqlite_data/vectordb.db"):
os.makedirs(os.path.dirname(db_path), exist_ok=True)
self.db_path = db_path
self.json_files = [
os.path.join(jsonl_dir, "archived.jsonl"),
os.path.join(jsonl_dir, "deprecated.jsonl"),
os.path.join(jsonl_dir, "malicious.jsonl"),
]
self.client.connect()
self.conn = self._get_connection()
self.inference_engine = LlamaCppInferenceEngine()
self.model_path = "./codegate_volume/models/all-minilm-L6-v2-q5_k_m.gguf"

def restore_backup(self):
if os.getenv("BACKUP_FOLDER"):
try:
self.client.backup.restore(
backup_id=os.getenv("BACKUP_FOLDER"),
backend="filesystem",
wait_for_completion=True,
)
except Exception as e:
print(f"Failed to restore backup: {e}")

def take_backup(self):
# if backup folder exists, remove it
backup_path = os.path.join(
os.getenv("BACKUP_FILESYSTEM_PATH", "/tmp"), os.getenv("BACKUP_TARGET_ID", "backup")
)
if os.path.exists(backup_path):
shutil.rmtree(backup_path)

# take a backup of the data
try:
self.client.backup.create(
backup_id=os.getenv("BACKUP_TARGET_ID", "backup"),
backend="filesystem",
wait_for_completion=True,
)
except Exception as e:
print(f"Failed to take backup: {e}")
def _get_connection(self):
conn = sqlite3.connect(self.db_path)
conn.enable_load_extension(True)
sqlite_vec.load(conn)
conn.enable_load_extension(False)
return conn

def setup_schema(self):
if not self.client.collections.exists("Package"):
self.client.collections.create(
"Package",
properties=[
Property(name="name", data_type=DataType.TEXT),
Property(name="type", data_type=DataType.TEXT),
Property(name="status", data_type=DataType.TEXT),
Property(name="description", data_type=DataType.TEXT),
],
cursor = self.conn.cursor()
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS packages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT NOT NULL,
type TEXT NOT NULL,
status TEXT NOT NULL,
description TEXT,
embedding BLOB
)
"""
)

# Create indexes for faster querying
cursor.execute("CREATE INDEX IF NOT EXISTS idx_name ON packages(name)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_type ON packages(type)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_status ON packages(status)")

async def process_package(self, batch, package):
self.conn.commit()

async def process_package(self, package):
vector_str = generate_vector_string(package)
vector = await self.inference_engine.embed(self.model_path, [vector_str])
# This is where the synchronous call is made
batch.add_object(properties=package, vector=vector[0])
vector_array = np.array(vector[0], dtype=np.float32)

cursor = self.conn.cursor()
cursor.execute(
"""
INSERT INTO packages (name, type, status, description, embedding)
VALUES (?, ?, ?, ?, ?)
""",
(
package["name"],
package["type"],
package["status"],
package["description"],
vector_array, # sqlite-vec will handle numpy arrays directly
),
)
self.conn.commit()

async def add_data(self):
collection = self.client.collections.get("Package")
existing_packages = list(collection.iterator())
packages_dict = {
f"{package.properties['name']}/{package.properties['type']}": {
"status": package.properties["status"],
"description": package.properties["description"],
}
for package in existing_packages
cursor = self.conn.cursor()

# Get existing packages
cursor.execute(
"""
SELECT name, type, status, description
FROM packages
"""
)
existing_packages = {
f"{row[0]}/{row[1]}": {"status": row[2], "description": row[3]}
for row in cursor.fetchall()
}

for json_file in self.json_files:
print("Adding data from", json_file)
with open(json_file, "r") as f:
print("Adding data from", json_file)
packages_to_insert = []
for line in f:
package = json.loads(line)
package["status"] = json_file.split("/")[-1].split(".")[0]
key = f"{package['name']}/{package['type']}"

if key in packages_dict and packages_dict[key] == {
if key in existing_packages and existing_packages[key] == {
"status": package["status"],
"description": package["description"],
}:
print("Package already exists", key)
continue

vector_str = generate_vector_string(package)
vector = await self.inference_engine.embed(self.model_path, [vector_str])
packages_to_insert.append((package, vector[0]))

# Synchronous batch insert after preparing all data
with collection.batch.dynamic() as batch:
for package, vector in packages_to_insert:
batch.add_object(
properties=package, vector=vector, uuid=generate_uuid5(package)
)
await self.process_package(package)

async def run_import(self):
if self.restore_backup_flag:
self.restore_backup()
self.setup_schema()
await self.add_data()
if self.take_backup_flag:
self.take_backup()

def __del__(self):
try:
if hasattr(self, "conn"):
self.conn.close()
except Exception as e:
print(f"Failed to close connection: {e}")


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Run the package importer with optional backup flags."
)
parser.add_argument(
"--take-backup",
type=lambda x: x.lower() == "true",
default=True,
help="Specify whether to take a backup after "
"data import (True or False). Default is True.",
)
parser.add_argument(
"--restore-backup",
type=lambda x: x.lower() == "true",
default=True,
help="Specify whether to restore a backup before "
"data import (True or False). Default is True.",
description="Import packages into SQLite database with vector search capabilities."
)
parser.add_argument(
"--jsonl-dir",
type=str,
default="data",
help="Directory containing JSONL files. Default is 'data'.",
)
parser.add_argument(
"--db-path",
type=str,
default="./sqlite_data/vectordb.db",
help="Path to SQLite database file. Default is './sqlite_data/vectordb.db'.",
)
args = parser.parse_args()

importer = PackageImporter(
jsonl_dir=args.jsonl_dir, take_backup=args.take_backup, restore_backup=args.restore_backup
)
importer = PackageImporter(jsonl_dir=args.jsonl_dir, db_path=args.db_path)
asyncio.run(importer.run_import())
try:
assert importer.client.is_live()
pass
finally:
importer.client.close()
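
The importer only writes rows; the runtime query path is not part of this script. For context, a similarity lookup over the `packages` table could look roughly like the sketch below. It relies on two assumptions: sqlite-vec's `vec_distance_cosine()` SQL function and `serialize_float32()` helper are available, and the stored `embedding` BLOBs are raw little-endian float32 vectors, matching the `np.float32` array written above.

```python
# Illustrative similarity query against the packages table; not part of this PR.
import sqlite3

import sqlite_vec
from sqlite_vec import serialize_float32


def search_packages(db_path: str, query_vector: list[float], top_k: int = 5):
    """Rank packages by cosine distance between stored and query embeddings."""
    conn = sqlite3.connect(db_path)
    conn.enable_load_extension(True)
    sqlite_vec.load(conn)
    conn.enable_load_extension(False)

    rows = conn.execute(
        """
        SELECT name, type, status, description,
               vec_distance_cosine(embedding, ?) AS distance
        FROM packages
        ORDER BY distance
        LIMIT ?
        """,
        (serialize_float32(query_vector), top_k),
    ).fetchall()
    conn.close()
    return rows
```

In the real service the query vector would come from the same all-minilm embedding model used at import time, so stored and query embeddings share the same dimensionality.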
Binary file added sqlite_data/vectordb.db
Binary file not shown.
11 changes: 5 additions & 6 deletions src/codegate/ca/codegate_ca.py
@@ -135,12 +135,12 @@ def _load_existing_certificates(self) -> None:
expiry_date = current_time + timedelta(days=TLS_GRACE_PERIOD_DAYS)

for filename in os.listdir(certs_dir):
if (
filename.endswith('.crt') and
filename not in [Config.get_config().ca_cert, Config.get_config().server_cert]
):
if filename.endswith(".crt") and filename not in [
Config.get_config().ca_cert,
Config.get_config().server_cert,
]:
cert_path = os.path.join(certs_dir, filename)
key_path = os.path.join(certs_dir, filename.replace('.crt', '.key'))
key_path = os.path.join(certs_dir, filename.replace(".crt", ".key"))

# Skip if key file doesn't exist
if not os.path.exists(key_path):
@@ -320,7 +320,6 @@ def generate_ca_certificates(self) -> None:
self._ca_cert_expiry = self._ca_cert.not_valid_after_utc
self._ca_last_load_time = datetime.now(timezone.utc)


# Define file paths for certificate and key
ca_cert_path = self.get_cert_path(Config.get_config().ca_cert)
ca_key_path = self.get_cert_path(Config.get_config().ca_key)