@@ -1,9 +1,10 @@
-from math import ceil
 from typing import Any, Dict, List, Optional, Union
 
 from ConfigSpace.configuration_space import ConfigurationSpace
 from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
     UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
 )
 
 import numpy as np
@@ -16,6 +17,36 @@
 from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
 
 
+def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]:
+    """
+    Returns a list of embedding sizes for each categorical variable.
+    Sizes are selected adaptively based on the training dataset.
+    Note: assumes there is at least one embed feature.
+
+    Args:
+        config (Dict[str, Any]):
+            contains the hyperparameters required to calculate the `num_output_dimensions`
+        num_categs_per_feature (List[int]):
+            list containing the number of categories for each feature that is to be embedded,
+            0 if the column is not an embed column
+
+    Returns:
+        List[int]:
+            list containing the output embedding size for each column,
+            1 if the column is not an embed column
+    """
+
+    max_embedding_dim = config['max_embedding_dim']
+    embed_exponent = config['embed_exponent']
+    size_factor = config['embedding_size_factor']
+    num_output_dimensions = [int(size_factor * max(
+        2,
+        min(max_embedding_dim,
+            1.6 * num_categories ** embed_exponent)))
+        if num_categories > 0 else 1 for num_categories in num_categs_per_feature]
+    return num_output_dimensions
+
+
 class _LearnedEntityEmbedding(nn.Module):
     """ Learned entity embedding module for categorical features"""
 
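As a quick sanity check of the new sizing rule (a sketch, not part of the diff: the config values mirror the search-space defaults introduced further down, and the category counts are invented):

# Assumes get_num_output_dimensions from the hunk above.
config = {'embed_exponent': 0.56, 'max_embedding_dim': 100, 'embedding_size_factor': 1.0}

# Columns with 3, 12 and 1000 categories, plus one non-embed column (0):
print(get_num_output_dimensions(config, [3, 0, 12, 1000]))
# -> [2, 1, 6, 76]; the max_embedding_dim cap only binds above roughly 1600
# categories, since 1.6 * 1600 ** 0.56 is about 100.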
@@ -35,9 +66,7 @@ def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, n
 
         self.num_embed_features = self.num_categories_per_col[self.embed_features]
 
-        self.num_output_dimensions = [1] * num_features_excl_embed
-        self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in
-                                           enumerate(self.num_embed_features)])
+        self.num_output_dimensions = get_num_output_dimensions(config, self.num_categories_per_col)
 
         self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions)
 
@@ -48,12 +77,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # before passing it through the model
         concat_seq = []
 
-        x_pointer = 0
         layer_pointer = 0
         for x_pointer, embed in enumerate(self.embed_features):
             current_feature_slice = x[:, x_pointer]
             if not embed:
-                x_pointer += 1
                 concat_seq.append(current_feature_slice.view(-1, 1))
                 continue
             current_feature_slice = current_feature_slice.to(torch.int)
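A note on the two deletions in this hunk: enumerate rebinds x_pointer at the top of every iteration, and x[:, x_pointer] is read before the if not embed: branch, so both the manual x_pointer = 0 and the in-loop x_pointer += 1 were dead code; removing them changes no behaviour. A minimal illustration:

# The increment cannot skip features: enumerate rebinds x_pointer each pass.
visited = []
for x_pointer, embed in enumerate([False, True, False]):
    x_pointer += 1  # dead write, overwritten on the next iteration
    visited.append(embed)
print(visited)  # -> [False, True, False], nothing skipped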
@@ -91,28 +118,24 @@ def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_
     @staticmethod
     def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
-        dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction",
-                                                                                   value_range=(0, 1),
-                                                                                   default_value=0.5),
+        embed_exponent: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embed_exponent",
+                                                                              value_range=(0.56,),
+                                                                              default_value=0.56),
+        max_embedding_dim: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_embedding_dim",
+                                                                                 value_range=(100,),
+                                                                                 default_value=100),
+        embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor",
+                                                                                     value_range=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5),
+                                                                                     default_value=1,
+                                                                                     ),
     ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
         if dataset_properties is not None:
-            for i in range(len(dataset_properties['categorical_columns'])
-                           if isinstance(dataset_properties['categorical_columns'], List) else 0):
-                # currently as we dont have information about the embedding columns
-                # we search for more dimensions than necessary. This can be solved by
-                # not having `min_unique_values_for_embedding` as a hyperparameter and
-                # instead passing it as a parameter to the feature validator, which
-                # allows us to pass embed_columns to the dataset properties.
-                # TODO: test the trade off
-                # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding`
-                # in one custom transformer. this will also allow users to use this transformer
-                # outside the pipeline
-                ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i),
-                                                                       value_range=dimension_reduction.value_range,
-                                                                       default_value=dimension_reduction.default_value,
-                                                                       log=dimension_reduction.log)
-                add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter)
+            if len(dataset_properties['categorical_columns']) > 0:
+                add_hyperparameter(cs, embed_exponent, UniformFloatHyperparameter)
+                add_hyperparameter(cs, max_embedding_dim, UniformIntegerHyperparameter)
+                add_hyperparameter(cs, embedding_size_factor, CategoricalHyperparameter)
+
         return cs
 
     @staticmethod
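End to end, the reduced search space can be exercised like this (a sketch: the two-column dataset_properties dict is invented, and it assumes the autoPyTorch imports above resolve):

# Hypothetical dataset with two categorical columns.
dataset_properties = {'categorical_columns': [0, 3]}
cs = _LearnedEntityEmbedding.get_hyperparameter_search_space(
    dataset_properties=dataset_properties)

# embed_exponent and max_embedding_dim have single-value ranges (autoPyTorch
# collapses those to constants), so embedding_size_factor is the only
# dimension actually searched: 11 categorical choices in place of the old
# per-column dimension_reduction_i floats.
print(cs.sample_configuration())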