Skip to content

Commit 93366bf

Browse files
drewoldag and Copilot
authored
Removes references to "table.sub_table" configuration settings when inappropriate (#405)
* Removes references to "table.sub_table" configuration settings where they are not needed. * Update benchmarks/vector_db_benchmarks.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Fixing another reference to the old double quote subtable. * Updating subtables from `hyrax_cnn` to `HyraxCNN` and `random_dataset` to `HyraxRandomDataset`. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 5bccee4 commit 93366bf

13 files changed

Lines changed: 46 additions & 46 deletions

File tree

benchmarks/vector_db_benchmarks.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,12 @@ def setup(self, vector_length, vector_db_implementation):
2828
self.h.config["model"]["name"] = "HyraxLoopback"
2929

3030
# Default inference batch size is 512, so this should result in 4 batch files
31-
self.h.config["data_set.random_dataset"]["size"] = 2048
32-
self.h.config["data_set.random_dataset"]["seed"] = 0
33-
self.h.config["data_set.random_dataset"]["shape"] = [vector_length]
31+
self.h.config["data_set"]["HyraxRandomDataset"]["size"] = 2048
32+
self.h.config["data_set"]["HyraxRandomDataset"]["seed"] = 0
33+
self.h.config["data_set"]["HyraxRandomDataset"]["shape"] = [vector_length]
3434

3535
# Qdrant requires the vector size in order to create its collections
36-
self.h.config["vector_db.qdrant"]["vector_size"] = vector_length
36+
self.h.config["vector_db"]["qdrant"]["vector_size"] = vector_length
3737

3838
weights_file = self.input_dir / "fakeweights"
3939
with open(weights_file, "a"):
@@ -88,9 +88,9 @@ def setup(self, shard_size_limit, vector_db_implementation):
8888
self.h.config["model"]["name"] = "HyraxLoopback"
8989

9090
# Default inference batch size is 512, so this should result in 4 batch files
91-
self.h.config["data_set.random_dataset"]["size"] = 4096
92-
self.h.config["data_set.random_dataset"]["seed"] = 0
93-
self.h.config["data_set.random_dataset"]["shape"] = [self.vector_length]
91+
self.h.config["data_set"]["HyraxRandomDataset"]["size"] = 4096
92+
self.h.config["data_set"]["HyraxRandomDataset"]["seed"] = 0
93+
self.h.config["data_set"]["HyraxRandomDataset"]["shape"] = [1024]
9494

9595
# Create a fake weights file and then run inference on the random dataset
9696
weights_file = self.input_dir / "fakeweights"
@@ -105,9 +105,9 @@ def setup(self, shard_size_limit, vector_db_implementation):
105105
self.data_sample = self.ds[4001]["image"].numpy()
106106

107107
self.h.config["vector_db"]["name"] = vector_db_implementation
108-
self.h.config["vector_db.chromadb"]["shard_size_limit"] = shard_size_limit
108+
self.h.config["vector_db"]["chromadb"]["shard_size_limit"] = shard_size_limit
109109
# Qdrant requires the vector size in order to create its collections
110-
self.h.config["vector_db.qdrant"]["vector_size"] = self.vector_length
110+
self.h.config["vector_db"]["qdrant"]["vector_size"] = 4096
111111

112112
# Save inference results to vector database and create a db connection
113113
self.h.save_to_database(output_dir=Path(self.output_dir))

src/hyrax/data_sets/random/hyrax_random_dataset.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -51,34 +51,34 @@ def __init__(self, config):
5151
Valid values are "nan", "inf", "-inf", "none", or a float value.
5252
"""
5353
# The total number of random data samples produced
54-
data_size = config["data_set.random_dataset"]["size"]
54+
data_size = config["data_set"]["HyraxRandomDataset"]["size"]
5555
if not isinstance(data_size, int):
5656
raise ValueError(
57-
f"Expected integer value for `config['data_set.random_dataset']['size']`, but got {data_size}"
57+
f"Expected integer for `config['data_set']['random_dataset']['size']`, but got {data_size}"
5858
)
5959

6060
# The shape of each random data sample as a tuple.
6161
# i.e. (3, 29, 29) = 3 layers of 2d data, each layer is 29x29 elements.
62-
data_shape = tuple(config["data_set.random_dataset"]["shape"])
62+
data_shape = tuple(config["data_set"]["HyraxRandomDataset"]["shape"])
6363
if not len(data_shape):
6464
raise ValueError(
65-
"Expected `config['data_set.random_dataset']['data_shape']` to have at least 1 value."
65+
"Expected `config['data_set']['random_dataset']['data_shape']` to have at least 1 value."
6666
)
6767

6868
for e in data_shape:
6969
if e < 1:
7070
raise ValueError(
71-
f"Expected all values in `config['data_set.random_dataset']['data_shape']`\
71+
f"Expected all values in `config['data_set']['random_dataset']['data_shape']`\
7272
to be > 0, but got {data_shape}."
7373
)
7474
if not isinstance(e, int):
7575
raise ValueError(
76-
f"Expected all values in `config['data_set.random_dataset']['data_shape']`\
76+
f"Expected all values in `config['data_set']['random_dataset']['data_shape']`\
7777
to be integers, but got {data_shape}."
7878
)
7979

8080
# Random seed to use for reproducibility
81-
seed = config["data_set.random_dataset"]["seed"]
81+
seed = config["data_set"]["HyraxRandomDataset"]["seed"]
8282
rng = np.random.default_rng(seed)
8383

8484
# Note: We raise exceptions if data_size is not an int, so we can assume
@@ -90,23 +90,23 @@ def __init__(self, config):
9090
self.id_list = list(range(id_start, id_start + data_size))
9191

9292
# Randomly insert flawed values (np.nan, np.inf, -np.inf, None, other float)
93-
num_invalid_values = config["data_set.random_dataset"]["number_invalid_values"]
93+
num_invalid_values = config["data_set"]["HyraxRandomDataset"]["number_invalid_values"]
9494
if num_invalid_values:
9595
# Determine what value to use for invalid values
96-
invalid_value_type = config["data_set.random_dataset"]["invalid_value_type"]
96+
invalid_value_type = config["data_set"]["HyraxRandomDataset"]["invalid_value_type"]
9797
if isinstance(invalid_value_type, str):
9898
try:
9999
invalid_value = INVALID_VALUES[invalid_value_type.lower()]
100100
except KeyError as err:
101101
raise ValueError(
102102
f"Invalid value type '{invalid_value_type}' provided. "
103-
f"Expected `config['data_set.random_dataset']['invalid_value_type']` "
104-
f"to beone of {list(INVALID_VALUES.keys())}"
103+
f"Expected `config['data_set']['random_dataset']['invalid_value_type']` "
104+
f"to be one of {list(INVALID_VALUES.keys())}"
105105
) from err
106106
else:
107107
if not isinstance(invalid_value_type, float):
108108
raise ValueError(
109-
f"Expected `config['data_set.random_dataset']['invalid_value_type']` to be "
109+
f"Expected `config['data_set']['random_dataset']['invalid_value_type']` to be "
110110
f"a string or a float, but got {type(invalid_value_type)}."
111111
)
112112
invalid_value = invalid_value_type
@@ -115,7 +115,7 @@ def __init__(self, config):
115115
flattened[random_inds] = invalid_value
116116

117117
# If a list of possible labels is provided, create the random label list.
118-
self.provided_labels = config["data_set.random_dataset"]["provided_labels"]
118+
self.provided_labels = config["data_set"]["HyraxRandomDataset"]["provided_labels"]
119119
if self.provided_labels:
120120
self.labels = rng.choice(self.provided_labels, size=data_size)
121121

src/hyrax/hyrax_default_config.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ latent_dim = 64
9595
final_layer = "tanh"
9696

9797

98-
[model.hyrax_cnn]
98+
[model.HyraxCNN]
9999
# The number of classes to predict as the output of the model. i.e. 2 would be a
100100
# binary classifer, 10 would predict the 10 classes in the CiFAR dataset.
101101
output_classes = 10
@@ -237,7 +237,7 @@ semi_height_deg = 0.00472
237237

238238

239239

240-
["data_set.random_dataset"]
240+
[data_set.HyraxRandomDataset]
241241
# Total number of samples produced by the random dataset
242242
size = 100
243243

@@ -290,7 +290,7 @@ vector_db_dir = false
290290
infer_results_dir = false
291291

292292

293-
["vector_db.chromadb"]
293+
[vector_db.chromadb]
294294
# The approximate maximum size of a shard before creating a new one. A smaller
295295
# value will decrease insert times while increasing search times.
296296
shard_size_limit = 65536
@@ -300,7 +300,7 @@ shard_size_limit = 65536
300300
vector_size_warning = 10000
301301

302302

303-
["vector_db.qdrant"]
303+
[vector_db.qdrant]
304304
# The number of elements in the vectors that will be stored in the vector database.
305305
# This must be the same as the size of the vectors produced by the model.
306306
vector_size = 64
@@ -325,7 +325,7 @@ parallel = false
325325
name = "umap.UMAP"
326326

327327

328-
["umap.UMAP"]
328+
[umap.UMAP]
329329
# Specify any parameter accepted by https://umap-learn.readthedocs.io/en/latest/api.html#umap
330330
# Dimension of the embedded space
331331
n_components = 2

src/hyrax/models/hyrax_cnn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def __init__(self, config, shape=(3, 32, 32)):
4848
self.conv2 = nn.Conv2d(hidden_channels_1, hidden_channels_2, 5)
4949
self.fc1 = nn.Linear(hidden_channels_2 * pool2_end_h * pool2_end_w, 120)
5050
self.fc2 = nn.Linear(120, 84)
51-
self.fc3 = nn.Linear(84, self.config["model"]["hyrax_cnn"]["output_classes"])
51+
self.fc3 = nn.Linear(84, self.config["model"]["HyraxCNN"]["output_classes"])
5252

5353
def conv2d_output_size(self, input_size, kernel_size, padding=0, stride=1, dilation=1) -> int:
5454
# From https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html

src/hyrax/vector_dbs/chromadb_impl.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,10 @@ def __init__(self, config, context):
7777
self.shard_size = 0 # The number of vectors in the current shard
7878

7979
# The approximate maximum size of a shard before a new one is created
80-
self.shard_size_limit = self.config["vector_db.chromadb"]["shard_size_limit"]
80+
self.shard_size_limit = self.config["vector_db"]["chromadb"]["shard_size_limit"]
8181

8282
# If set, inserting a vector with number of elements >= this logs a warning.
83-
self.vector_size_limit = self.config["vector_db.chromadb"]["vector_size_warning"]
83+
self.vector_size_limit = self.config["vector_db"]["chromadb"]["vector_size_warning"]
8484

8585
# Min number of shards before using multiprocess to parallelize the search
8686
self.min_shards_for_parallelization = MIN_SHARDS_FOR_PARALLELIZATION
@@ -144,7 +144,7 @@ def insert(self, ids: list[Union[str, int]], vectors: list[np.ndarray]):
144144
logger.warning(
145145
f"Attempting to insert vectors with length: {len(vectors[0])}.\
146146
Chroma DB often has poor performance when working with vectors\
147-
larger than {self.config['vector_db.chromadb']['vector_size_warning']}"
147+
larger than {self.config['vector_db']['chromadb']['vector_size_warning']}"
148148
)
149149

150150
# increment counter, if exceeds shard limit, create a new collection

src/hyrax/vector_dbs/qdrantdb_impl.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def create(self):
5858
#! This stinks - we should just check the size of the data
5959
#! when we call `save_to_database` and then set this automatically
6060
#! as a parameter in self.context["blah"] or something.
61-
size=self.config["vector_db.qdrant"]["vector_size"],
61+
size=self.config["vector_db"]["qdrant"]["vector_size"],
6262
distance=models.Distance.EUCLID,
6363
on_disk=True,
6464
),
@@ -85,7 +85,7 @@ def insert(self, ids: list[Union[str, int]], vectors: list[np.ndarray]):
8585
if self.client is None:
8686
self.connect()
8787

88-
expected_size = self.config["vector_db.qdrant"]["vector_size"]
88+
expected_size = self.config["vector_db"]["qdrant"]["vector_size"]
8989
for idx, vector in enumerate(vectors):
9090
if len(vector) != expected_size:
9191
raise ValueError(

src/hyrax/verbs/umap.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def _run(self, input_dir: Optional[Union[Path, str]] = None):
7878
from hyrax.config_utils import create_results_dir
7979
from hyrax.data_sets.inference_dataset import InferenceDataSet, InferenceDataSetWriter
8080

81-
self.reducer = umap.UMAP(**self.config["umap.UMAP"])
81+
self.reducer = umap.UMAP(**self.config["umap"]["UMAP"])
8282

8383
# Load all the latent space data.
8484
inference_results = InferenceDataSet(self.config, results_dir=input_dir)

tests/hyrax/conftest.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,9 @@ def loopback_hyrax(tmp_path_factory, request):
105105

106106
h.config["general"]["dev_mode"] = True
107107
h.config["data_set"]["name"] = request.param
108-
h.config["data_set.random_dataset"]["size"] = 20
109-
h.config["data_set.random_dataset"]["seed"] = 0
110-
h.config["data_set.random_dataset"]["shape"] = [2, 3]
108+
h.config["data_set"]["HyraxRandomDataset"]["size"] = 20
109+
h.config["data_set"]["HyraxRandomDataset"]["seed"] = 0
110+
h.config["data_set"]["HyraxRandomDataset"]["shape"] = [2, 3]
111111

112112
h.config["data_set"]["validate_size"] = 0.2
113113
h.config["data_set"]["test_size"] = 0.2

tests/hyrax/test_chromadb_impl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ def test_insert_does_not_raise_warning(caplog, tmp_path, random_vector_generator
102102
when the config vector_size_warning is set to `False`."""
103103

104104
h = Hyrax()
105-
h.config["vector_db.chromadb"]["vector_size_warning"] = False
105+
h.config["vector_db"]["chromadb"]["vector_size_warning"] = False
106106
chromadb_instance = ChromaDB(h.config, {"results_dir": tmp_path})
107107
chromadb_instance.connect()
108108
chromadb_instance.create()

tests/hyrax/test_nan.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@ def loopback_hyrax_nan(tmp_path_factory, request):
3434

3535
h.config["general"]["dev_mode"] = True
3636
h.config["data_set"]["name"] = request.param
37-
h.config["data_set.random_dataset"]["size"] = 20
38-
h.config["data_set.random_dataset"]["seed"] = 0
39-
h.config["data_set.random_dataset"]["shape"] = [2, 3]
40-
h.config["data_set.random_dataset"]["number_invalid_values"] = 40
41-
h.config["data_set.random_dataset"]["invalid_value_type"] = "nan"
37+
h.config["data_set"]["HyraxRandomDataset"]["size"] = 20
38+
h.config["data_set"]["HyraxRandomDataset"]["seed"] = 0
39+
h.config["data_set"]["HyraxRandomDataset"]["shape"] = [2, 3]
40+
h.config["data_set"]["HyraxRandomDataset"]["number_invalid_values"] = 40
41+
h.config["data_set"]["HyraxRandomDataset"]["invalid_value_type"] = "nan"
4242

4343
h.config["data_set"]["validate_size"] = 0.2
4444
h.config["data_set"]["test_size"] = 0.2

0 commit comments

Comments (0)