
Commit 3761b53

Reg cocktails apt1.0+reg cocktails pytorch embedding reduced (#454)
* reduce number of hyperparameters for pytorch embedding
* remove todos for the preprocessing PR, and apply suggestion from code review
* remove unwanted exclude in test
1 parent 6830116 commit 3761b53

File tree: 5 files changed (+57, −38 lines)

autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py

Lines changed: 48 additions & 25 deletions
@@ -1,9 +1,10 @@
-from math import ceil
 from typing import Any, Dict, List, Optional, Union
 
 from ConfigSpace.configuration_space import ConfigurationSpace
 from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
     UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
 )
 
 import numpy as np
@@ -16,6 +17,36 @@
 from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
 
 
+def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]:
+    """
+    Returns list of embedding sizes for each categorical variable.
+    Selects this adaptively based on the training dataset.
+    Note: Assumes there is at least one embed feature.
+
+    Args:
+        config (Dict[str, Any]):
+            contains the hyperparameters required to calculate the `num_output_dimensions`
+        num_categs_per_feature (List[int]):
+            list containing number of categories for each feature that is to be embedded,
+            0 if the column is not an embed column
+
+    Returns:
+        List[int]:
+            list containing the output embedding size for each column,
+            1 if the column is not an embed column
+    """
+
+    max_embedding_dim = config['max_embedding_dim']
+    embed_exponent = config['embed_exponent']
+    size_factor = config['embedding_size_factor']
+    num_output_dimensions = [int(size_factor*max(
+                                 2,
+                                 min(max_embedding_dim,
+                                     1.6 * num_categories**embed_exponent)))
+                             if num_categories > 0 else 1 for num_categories in num_categs_per_feature]
+    return num_output_dimensions
+
+
 class _LearnedEntityEmbedding(nn.Module):
     """ Learned entity embedding module for categorical features"""
 
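As a worked example of the sizing rule, here is a minimal standalone sketch: the function body mirrors the one added above, while the config values and category counts are made up for illustration.

from typing import Any, Dict, List


def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]:
    # Clamp 1.6 * n**embed_exponent to [2, max_embedding_dim], scale by
    # embedding_size_factor, and map non-embed columns (0 categories) to size 1.
    max_embedding_dim = config['max_embedding_dim']
    embed_exponent = config['embed_exponent']
    size_factor = config['embedding_size_factor']
    return [int(size_factor * max(2, min(max_embedding_dim, 1.6 * n ** embed_exponent)))
            if n > 0 else 1 for n in num_categs_per_feature]


# Hypothetical config using the new defaults, with made-up category counts:
config = {'embed_exponent': 0.56, 'max_embedding_dim': 100, 'embedding_size_factor': 1.0}
print(get_num_output_dimensions(config, [0, 3, 12, 100]))  # [1, 2, 6, 21]

With the defaults, a 3-category column gets a 2-dimensional embedding, a 100-category column gets 21 dimensions, and non-embed columns pass through with size 1.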
@@ -35,9 +66,7 @@ def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, n
 
         self.num_embed_features = self.num_categories_per_col[self.embed_features]
 
-        self.num_output_dimensions = [1] * num_features_excl_embed
-        self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in
-                                           enumerate(self.num_embed_features)])
+        self.num_output_dimensions = get_num_output_dimensions(config, self.num_categories_per_col)
 
         self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions)
 
@@ -48,12 +77,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # before passing it through the model
         concat_seq = []
 
-        x_pointer = 0
         layer_pointer = 0
         for x_pointer, embed in enumerate(self.embed_features):
             current_feature_slice = x[:, x_pointer]
             if not embed:
-                x_pointer += 1
                 concat_seq.append(current_feature_slice.view(-1, 1))
                 continue
             current_feature_slice = current_feature_slice.to(torch.int)
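This hunk removes dead pointer bookkeeping: enumerate already reassigns x_pointer on every iteration, so the manual initialization and increment had no effect. A toy sketch of the same loop, with made-up tensors and a hand-built ModuleList standing in for the module's ee_layers:

import torch
import torch.nn as nn

# Toy setup: column 0 is numerical, column 1 is categorical with 3 categories.
embed_features = [False, True]                   # per-column embed flags
ee_layers = nn.ModuleList([nn.Embedding(3, 2)])  # one layer per embedded column

x = torch.tensor([[0.5, 2.0], [1.0, 0.0]])
concat_seq = []
layer_pointer = 0
for x_pointer, embed in enumerate(embed_features):
    current_feature_slice = x[:, x_pointer]
    if not embed:
        # numerical columns pass through unchanged
        concat_seq.append(current_feature_slice.view(-1, 1))
        continue
    # categorical columns are cast to integer indices and embedded
    concat_seq.append(ee_layers[layer_pointer](current_feature_slice.to(torch.long)))
    layer_pointer += 1

out = torch.cat(concat_seq, dim=1)
print(out.shape)  # torch.Size([2, 3]): 1 passthrough column + 2 embedding dims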
@@ -91,28 +118,24 @@ def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_
     @staticmethod
     def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
-        dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction",
-                                                                                   value_range=(0, 1),
-                                                                                   default_value=0.5),
+        embed_exponent: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embed_exponent",
+                                                                              value_range=(0.56,),
+                                                                              default_value=0.56),
+        max_embedding_dim: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_embedding_dim",
+                                                                                 value_range=(100,),
+                                                                                 default_value=100),
+        embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor",
+                                                                                     value_range=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5),
+                                                                                     default_value=1,
+                                                                                     ),
     ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
         if dataset_properties is not None:
-            for i in range(len(dataset_properties['categorical_columns'])
-                           if isinstance(dataset_properties['categorical_columns'], List) else 0):
-                # currently as we dont have information about the embedding columns
-                # we search for more dimensions than necessary. This can be solved by
-                # not having `min_unique_values_for_embedding` as a hyperparameter and
-                # instead passing it as a parameter to the feature validator, which
-                # allows us to pass embed_columns to the dataset properties.
-                # TODO: test the trade off
-                # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding`
-                # in one custom transformer. this will also allow users to use this transformer
-                # outside the pipeline
-                ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i),
-                                                                       value_range=dimension_reduction.value_range,
-                                                                       default_value=dimension_reduction.default_value,
-                                                                       log=dimension_reduction.log)
-                add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter)
+            if len(dataset_properties['categorical_columns']) > 0:
+                add_hyperparameter(cs, embed_exponent, UniformFloatHyperparameter)
+                add_hyperparameter(cs, max_embedding_dim, UniformIntegerHyperparameter)
+                add_hyperparameter(cs, embedding_size_factor, CategoricalHyperparameter)
+
         return cs
 
     @staticmethod
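The net effect on the search space: instead of one dimension_reduction_i float per categorical column, the component now exposes three shared hyperparameters, of which only embedding_size_factor varies. A rough equivalent built directly with ConfigSpace (autoPyTorch's HyperparameterSearchSpace/add_hyperparameter helpers typically collapse single-value ranges like (0.56,) to constants; this is an illustrative approximation, not the component's own construction):

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter, Constant

cs = ConfigurationSpace()
# embed_exponent and max_embedding_dim have single-value ranges, so they are fixed
cs.add_hyperparameter(Constant('embed_exponent', 0.56))
cs.add_hyperparameter(Constant('max_embedding_dim', 100))
# embedding_size_factor is the only dimension the optimizer actually explores
cs.add_hyperparameter(CategoricalHyperparameter(
    'embedding_size_factor',
    choices=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5],
    default_value=1.0))
print(cs.sample_configuration())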

autoPyTorch/pipeline/tabular_regression.py

Lines changed: 3 additions & 0 deletions
@@ -20,6 +20,9 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import (
     ColumnSplitter
 )
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import (
+    ColumnSplitter
+)
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import (
     EncoderChoice
 )

test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py

Lines changed: 0 additions & 2 deletions
@@ -13,8 +13,6 @@
 )
 
 
-# TODO: fix in preprocessing PR
-# @pytest.mark.skip("Skipping tests as preprocessing is not finalised")
 @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only',
                                                     'classification_categorical_only',
                                                     'classification_numerical_and_categorical'], indirect=True)

test/test_pipeline/components/setup/test_setup_networks.py

Lines changed: 0 additions & 1 deletion
@@ -19,7 +19,6 @@ def head(request):
     return request.param
 
 
-# TODO: add 'LearnedEntityEmbedding' after preprocessing dix
 @pytest.fixture(params=['NoEmbedding', 'LearnedEntityEmbedding'])
 def embedding(request):
     return request.param

test/test_pipeline/test_tabular_regression.py

Lines changed: 6 additions & 10 deletions
@@ -61,11 +61,10 @@ def test_pipeline_fit(self, fit_dictionary_tabular):
         """This test makes sure that the pipeline is able to fit
         given random combinations of hyperparameters across the pipeline"""
         # TODO: fix issue where adversarial also works for regression
-        # TODO: Fix issue with learned entity embedding after preprocessing PR
+
         pipeline = TabularRegressionPipeline(
             dataset_properties=fit_dictionary_tabular['dataset_properties'],
-            exclude={'trainer': ['AdversarialTrainer'],
-                     'network_embedding': ['LearnedEntityEmbedding']})
+            exclude={'trainer': ['AdversarialTrainer']})
         cs = pipeline.get_hyperparameter_search_space()
 
         config = cs.sample_configuration()
@@ -91,8 +90,7 @@ def test_pipeline_predict(self, fit_dictionary_tabular):
         X = fit_dictionary_tabular['X_train'].copy()
         pipeline = TabularRegressionPipeline(
             dataset_properties=fit_dictionary_tabular['dataset_properties'],
-            exclude={'trainer': ['AdversarialTrainer'],
-                     'network_embedding': ['LearnedEntityEmbedding']})
+            exclude={'trainer': ['AdversarialTrainer']})
 
         cs = pipeline.get_hyperparameter_search_space()
         config = cs.sample_configuration()
@@ -121,8 +119,7 @@ def test_pipeline_transform(self, fit_dictionary_tabular):
 
         pipeline = TabularRegressionPipeline(
             dataset_properties=fit_dictionary_tabular['dataset_properties'],
-            exclude={'trainer': ['AdversarialTrainer'],
-                     'network_embedding': ['LearnedEntityEmbedding']})
+            exclude={'trainer': ['AdversarialTrainer']})
         cs = pipeline.get_hyperparameter_search_space()
         config = cs.sample_configuration()
         pipeline.set_hyperparameters(config)
@@ -139,11 +136,10 @@ def test_pipeline_transform(self, fit_dictionary_tabular):
         assert fit_dictionary_tabular.items() <= transformed_fit_dictionary_tabular.items()
 
         # Then the pipeline should have added the following keys
-        # Removing 'imputer', 'encoder', 'scaler', these will be
-        # TODO: added back after a PR fixing preprocessing
         expected_keys = {'tabular_transformer', 'preprocess_transforms', 'network',
                          'optimizer', 'lr_scheduler', 'train_data_loader',
-                         'val_data_loader', 'run_summary', 'feature_preprocessor'}
+                         'val_data_loader', 'run_summary', 'feature_preprocessor',
+                         'imputer', 'encoder', 'scaler'}
         assert expected_keys.issubset(set(transformed_fit_dictionary_tabular.keys()))
 
         # Then we need to have transformations being created.
