Skip to content

Commit 72c1e48

Browse files
committed
feat(vector-db): add cve_packages table
1 parent 3a75004 commit 72c1e48

File tree

2 files changed

+44
-8
lines changed

2 files changed

+44
-8
lines changed

Diff for: .github/workflows/import_packages.yml

+2
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ jobs:
4747
MALICIOUS_KEY=$(jq -r '.latest.malicious_packages' manifest.json)
4848
DEPRECATED_KEY=$(jq -r '.latest.deprecated_packages' manifest.json)
4949
ARCHIVED_KEY=$(jq -r '.latest.archived_packages' manifest.json)
50+
VULNERABLE_KEY=$(jq -r '.latest.vulnerable_packages' manifest.json)
5051
5152
echo "Malicious key: $MALICIOUS_KEY"
5253
echo "Deprecated key: $DEPRECATED_KEY"
@@ -58,6 +59,7 @@ jobs:
5859
aws s3 cp s3://codegate-data-prod/$MALICIOUS_KEY /tmp/jsonl-files/malicious.jsonl --region $AWS_REGION
5960
aws s3 cp s3://codegate-data-prod/$DEPRECATED_KEY /tmp/jsonl-files/deprecated.jsonl --region $AWS_REGION
6061
aws s3 cp s3://codegate-data-prod/$ARCHIVED_KEY /tmp/jsonl-files/archived.jsonl --region $AWS_REGION
62+
aws s3 cp s3://codegate-data-prod/$VULNERABLE_KEY /tmp/jsonl-files/vulnerable.jsonl --region $AWS_REGION
6163
6264
- name: Install Poetry
6365
run: |

Diff for: scripts/import_packages.py

+42-8
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def __init__(self, jsonl_dir="data", vec_db_path="./sqlite_data/vectordb.db"):
2020
os.path.join(jsonl_dir, "archived.jsonl"),
2121
os.path.join(jsonl_dir, "deprecated.jsonl"),
2222
os.path.join(jsonl_dir, "malicious.jsonl"),
23+
os.path.join(jsonl_dir, "vulnerable.jsonl"),
2324
]
2425
self.conn = self._get_connection()
2526
Config.load() # Load the configuration
@@ -48,13 +49,41 @@ def setup_schema(self):
4849
"""
4950
)
5051

52+
# Table for packages that have at least one high or critical vulnerability
53+
cursor.execute(
54+
"""
55+
CREATE TABLE cve_packages (
56+
name TEXT NOT NULL,
57+
version TEXT NOT NULL,
58+
type TEXT NOT NULL
59+
)
60+
"""
61+
)
62+
5163
# Create indexes for faster querying
5264
cursor.execute("CREATE INDEX IF NOT EXISTS idx_name ON packages(name)")
5365
cursor.execute("CREATE INDEX IF NOT EXISTS idx_type ON packages(type)")
5466
cursor.execute("CREATE INDEX IF NOT EXISTS idx_status ON packages(status)")
67+
cursor.execute("CREATE INDEX IF NOT EXISTS idx_pkg_cve_name ON cve_packages(name)")
68+
cursor.execute("CREATE INDEX IF NOT EXISTS idx_pkg_cve_type ON cve_packages(type)")
69+
cursor.execute("CREATE INDEX IF NOT EXISTS idx_pkg_cve_version ON cve_packages(version)")
5570

5671
self.conn.commit()
5772

73+
async def process_cve_packages(self, package):
74+
cursor = self.conn.cursor()
75+
cursor.execute(
76+
"""
77+
INSERT INTO cve_packages (name, version, type) VALUES (?, ?, ?)
78+
""",
79+
(
80+
package["name"],
81+
package["version"],
82+
package["type"],
83+
),
84+
)
85+
self.conn.commit()
86+
5887
async def process_package(self, package):
5988
vector_str = generate_vector_string(package)
6089
vector = await self.inference_engine.embed(
@@ -101,14 +130,19 @@ async def add_data(self):
101130
package["status"] = json_file.split("/")[-1].split(".")[0]
102131
key = f"{package['name']}/{package['type']}"
103132

104-
if key in existing_packages and existing_packages[key] == {
105-
"status": package["status"],
106-
"description": package["description"],
107-
}:
108-
print("Package already exists", key)
109-
continue
110-
111-
await self.process_package(package)
133+
if package["status"] == "vulnerable":
134+
# Process vulnerable packages using the CVE flow
135+
await self.process_cve_packages(package)
136+
else:
137+
# For non-vulnerable packages, check for duplicates and process normally
138+
if key in existing_packages and existing_packages[key] == {
139+
"status": package["status"],
140+
"description": package["description"],
141+
}:
142+
print("Package already exists", key)
143+
continue
144+
145+
await self.process_package(package)
112146

113147
async def run_import(self):
114148
self.setup_schema()

0 commit comments

Comments
 (0)