@@ -26,23 +26,45 @@ def setup_schema(client):
26
26
)
27
27
28
28
29
def generate_vector_string(package):
    """Build the natural-language sentence that is embedded for a package.

    The sentence starts with the package name, is followed by a short
    description of the ecosystem the package belongs to (when the ``type``
    is recognised) and, for packages flagged as archived, deprecated or
    malicious, a warning sentence that points to the package's page on
    trustypkg.dev.

    Args:
        package: Mapping describing the package. ``name`` is required;
            ``type`` and ``status`` are optional and unrecognised values
            are ignored.

    Returns:
        The text to feed to the embedding model.

    Raises:
        KeyError: If ``package`` has no ``name`` entry.
    """
    # Descriptions deliberately carry no trailing punctuation: the status
    # sentence below supplies the ". " separator itself, so ending a
    # description with a period would yield ". . However" in the output.
    ecosystem_descriptions = {
        "pypi": "is a Python package available on PyPI",
        "npm": "is a JavaScript package available on NPM",
        "go": "is a Go package",
        "crates": "is a Rust package available on Crates",
        "java": "is a Java package",
    }
    status_findings = {
        "archived": "archived and no longer maintained",
        "deprecated": "deprecated and no longer recommended for use",
        "malicious": "malicious",
    }

    name = package["name"]
    package_type = package.get("type")

    vector_str = f"{name}"
    package_url = ""

    description = ecosystem_descriptions.get(package_type)
    if description is not None:
        vector_str += f" {description}"
        # The URL path segment is the ecosystem key itself.
        package_url = f"https://trustypkg.dev/{package_type}/{name}"

    finding = status_findings.get(package.get("status"))
    if finding is not None:
        vector_str += f". However, this package is found to be {finding}"
        # Only reference the details page when one exists; an unknown
        # ecosystem has no URL and would leave a dangling "refer to ".
        if package_url:
            vector_str += (
                f". For additional information refer to {package_url}"
            )

    return vector_str
57
+
58
+
29
59
def add_data (client ):
30
60
collection = client .collections .get ("Package" )
31
61
32
62
for json_file in json_files :
33
63
with open (json_file , 'r' ) as f :
34
64
print ("Adding data from" , json_file )
35
- counter = 0
36
65
with collection .batch .dynamic () as batch :
37
66
for line in f :
38
67
package = json .loads (line )
39
- counter += 1
40
- if counter > 100 :
41
- break
42
-
43
- # prepare the object for embedding
44
- vector_str = f"{ package ['name' ]} { package ['description' ]} "
45
- vector = generate_embeddings (vector_str )
46
68
47
69
# now add the status column
48
70
if 'archived' in json_file :
@@ -54,6 +76,10 @@ def add_data(client):
54
76
else :
55
77
package ['status' ] = 'unknown'
56
78
79
+ # prepare the object for embedding
80
+ vector_str = generate_vector_string (package )
81
+ vector = generate_embeddings (vector_str )
82
+
57
83
batch .add_object (properties = package , vector = vector )
58
84
59
85
0 commit comments