
Commit d33388b

Feature preprocessors, Loss strategies (#86)
* ADD Weighted loss
* Now?
* Fix tests, flake, mypy
* Fix tests
* Fix mypy
* change back sklearn requirement
* Assert for fast ica sklearn bug
* Forgot to add skip
* Fix tests, changed num only data to float
* removed fast ica
* change num only dataset
* Increased number of features in num only
* Increase timeout for pytest
* ADD tensorboard to requirement
* Fix bug with small_preprocess
* Fix bug in pytest execution
* Fix tests
* ADD error is raised if default not in include
* Added dynamic search space for deciding n components in feature preprocessors, add test for pipeline include
* Moved back to random configs in tabular test
* Added floor and ceil and handling of logs
* Fix flake
* Remove TruncatedSVD from cs if num numerical ==1
* ADD flakyness to network accuracy test
* fix flake
* remove cla to pytest
1 parent c38b42e commit d33388b


43 files changed (+1104, -186 lines)

.github/workflows/pytest.yml

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ jobs:
      - name: Run tests
        run: |
          if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autoPyTorch --cov-report=xml'; fi
-         python -m pytest -n 2 --timeout=600 --timeout-method=thread --dist load test -sv $codecov
+         python -m pytest --durations=20 --timeout=300 --timeout-method=thread -v $codecov test
      - name: Check for files left behind by test
        if: ${{ always() }}
        run: |

autoPyTorch/datasets/base_dataset.py

Lines changed: 9 additions & 0 deletions
@@ -48,6 +48,7 @@ class TransformSubset(Subset):
 
     We achieve so by adding a train flag to the pytorch subset
     """
+
     def __init__(self, dataset: Dataset, indices: Sequence[int], train: bool) -> None:
         self.dataset = dataset
         self.indices = indices
@@ -371,3 +372,11 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) ->
             'num_classes': self.num_classes,
         })
         return dataset_properties
+
+    def get_required_dataset_info(self) -> Dict[str, Any]:
+        """
+        Returns a dictionary containing required dataset properties to instantiate a pipeline,
+        """
+        info = {'output_type': self.output_type,
+                'issparse': self.issparse}
+        return info

autoPyTorch/datasets/tabular_dataset.py

Lines changed: 13 additions & 4 deletions
@@ -104,7 +104,6 @@ def __init__(self, X: Union[np.ndarray, pd.DataFrame],
         # rather to have a performance through time on the test data
         if X_test is not None:
             X_test, self._test_data_types, _, _, _ = self.interpret_columns(X_test)
-
             # Some quality checks on the data
             if self.data_types != self._test_data_types:
                 raise ValueError(f"The train data inferred types {self.data_types} are "
@@ -205,8 +204,7 @@ def interpret_columns(self,
 
         return data, data_types, nan_mask, itovs, vtois
 
-    def infer_dataset_properties(self, X: Any) \
-            -> Tuple[List[int], List[int], List[object], int]:
+    def infer_dataset_properties(self, X: Any) -> Tuple[List[int], List[int], List[object], int]:
         """
         Infers the properties of the dataset like
         categorical_columns, numerical_columns, categories, num_features
@@ -225,5 +223,16 @@ def infer_dataset_properties(self, X: Any) \
             numerical_columns.append(i)
         categories = [np.unique(X.iloc[:, a]).tolist() for a in categorical_columns]
         num_features = X.shape[1]
-
         return categorical_columns, numerical_columns, categories, num_features
+
+    def get_required_dataset_info(self) -> Dict[str, Any]:
+        """
+        Returns a dictionary containing required dataset properties to instantiate a pipeline,
+        """
+        info = super().get_required_dataset_info()
+        info.update({
+            'numerical_columns': self.numerical_columns,
+            'categorical_columns': self.categorical_columns,
+            'task_type': self.task_type
+        })
+        return info
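
For orientation, here is a minimal self-contained sketch of the pattern these two hunks introduce: the base class reports the properties every dataset shares, and the tabular subclass extends that dict via super(). The toy classes and attribute values below are illustrative stand-ins, not autoPyTorch code.

from typing import Any, Dict


class ToyBaseDataset:
    """Toy stand-in for BaseDataset with the new hook."""
    output_type = 'continuous'
    issparse = False

    def get_required_dataset_info(self) -> Dict[str, Any]:
        # Only the properties every dataset type has.
        return {'output_type': self.output_type, 'issparse': self.issparse}


class ToyTabularDataset(ToyBaseDataset):
    """Toy stand-in for TabularDataset, extending the base dict."""
    numerical_columns = [0, 1]
    categorical_columns = [2]
    task_type = 'tabular_classification'

    def get_required_dataset_info(self) -> Dict[str, Any]:
        info = super().get_required_dataset_info()
        info.update({
            'numerical_columns': self.numerical_columns,
            'categorical_columns': self.categorical_columns,
            'task_type': self.task_type,
        })
        return info


print(ToyTabularDataset().get_required_dataset_info())
# {'output_type': 'continuous', 'issparse': False, 'numerical_columns': [0, 1],
#  'categorical_columns': [2], 'task_type': 'tabular_classification'}

Letting subclasses update() a small base dict means a new dataset type only has to declare the extra properties its pipelines need.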

autoPyTorch/evaluation/abstract_evaluator.py

Lines changed: 2 additions & 2 deletions
@@ -32,8 +32,7 @@
 from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.tabular_dataset import TabularDataset
 from autoPyTorch.evaluation.utils import (
-    convert_multioutput_multiclass_to_multilabel,
-    subsampler
+    convert_multioutput_multiclass_to_multilabel
 )
 from autoPyTorch.pipeline.base_pipeline import BasePipeline
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
@@ -42,6 +41,7 @@
     get_metrics,
 )
 from autoPyTorch.utils.backend import Backend
+from autoPyTorch.utils.common import subsampler
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
 from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger
 from autoPyTorch.utils.pipeline import get_dataset_requirements

autoPyTorch/evaluation/train_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -17,9 +17,9 @@
     AbstractEvaluator,
     fit_and_suppress_warnings
 )
-from autoPyTorch.evaluation.utils import subsampler
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
 from autoPyTorch.utils.backend import Backend
+from autoPyTorch.utils.common import subsampler
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
 
 __all__ = ['TrainEvaluator', 'eval_function']

autoPyTorch/evaluation/utils.py

Lines changed: 0 additions & 8 deletions
@@ -4,8 +4,6 @@
 
 import numpy as np
 
-import pandas as pd
-
 from smac.runhistory.runhistory import RunValue
 
 __all__ = [
@@ -16,12 +14,6 @@
 ]
 
 
-def subsampler(data: Union[np.ndarray, pd.DataFrame],
-               x: Union[np.ndarray, List[int]]
-               ) -> Union[np.ndarray, pd.DataFrame]:
-    return data[x] if isinstance(data, np.ndarray) else data.iloc[x]
-
-
 def read_queue(queue_: Queue) -> List[RunValue]:
     stack: List[RunValue] = []
     while True:
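
The helper deleted here moves verbatim to autoPyTorch.utils.common. For illustration, its behavior, reproduced from the removed body, is plain positional row selection on either container:

from typing import List, Union

import numpy as np
import pandas as pd


def subsampler(data: Union[np.ndarray, pd.DataFrame],
               x: Union[np.ndarray, List[int]]
               ) -> Union[np.ndarray, pd.DataFrame]:
    # ndarray: fancy indexing; DataFrame: positional .iloc indexing.
    return data[x] if isinstance(data, np.ndarray) else data.iloc[x]


arr = np.arange(10).reshape(5, 2)
df = pd.DataFrame(arr, columns=['a', 'b'])
print(subsampler(arr, [0, 2]))  # rows 0 and 2 of the array
print(subsampler(df, [0, 2]))   # the same rows of the DataFrame

Centralizing it in utils.common lets the evaluators and the preprocessing components (see TabularColumnTransformer below) share one implementation.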

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py

Lines changed: 3 additions & 4 deletions
@@ -11,7 +11,7 @@
     autoPyTorchTabularPreprocessingComponent
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers
-from autoPyTorch.utils.common import FitRequirement
+from autoPyTorch.utils.common import FitRequirement, subsampler
 
 
 class TabularColumnTransformer(autoPyTorchTabularPreprocessingComponent):
@@ -48,7 +48,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
             "TabularColumnTransformer": an instance of self
         """
         self.check_requirements(X, y)
-
         numerical_pipeline = 'drop'
         categorical_pipeline = 'drop'
 
@@ -67,11 +66,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
         # Where to get the data -- Prioritize X_train if any else
         # get from backend
         if 'X_train' in X:
-            X_train = X['X_train']
+            X_train = subsampler(X['X_train'], X['train_indices'])
         else:
             X_train = X['backend'].load_datamanager().train_tensors[0]
-        self.preprocessor.fit(X_train)
 
+        self.preprocessor.fit(X_train)
         return self
 
     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
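
The reason for subsampling before the fit: the column transformer's statistics must come from the training split only. A hedged, sklearn-only sketch of the difference (the data and split indices are made up; StandardScaler stands in for the real column transformer):

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0], [2.0], [3.0], [100.0]])  # last row: validation split
train_indices = [0, 1, 2]

leaky = StandardScaler().fit(X)                 # old behavior: validation row leaks in
clean = StandardScaler().fit(X[train_indices])  # new behavior: training rows only

print(leaky.mean_, clean.mean_)  # [26.5] [2.]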

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py

Lines changed: 3 additions & 0 deletions
@@ -78,6 +78,9 @@ def get_hyperparameter_search_space(self,
         # add only no encoder to choice hyperparameters in case the dataset is only numerical
         if len(dataset_properties['categorical_columns']) == 0:
             default = 'NoEncoder'
+            if include is not None and default not in include:
+                raise ValueError("Provided {} in include, however, the dataset "
+                                 "is incompatible with it".format(include))
         preprocessor = CSH.CategoricalHyperparameter('__choice__',
                                                      ['NoEncoder'],
                                                      default_value=default)
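
A sketch of the guard this hunk adds, pulled out as a standalone function for illustration (the function name and shape are hypothetical; the logic mirrors the diff): with no categorical columns the only valid choice is NoEncoder, so an include list that omits it cannot be satisfied.

from typing import Any, Dict, List, Optional


def check_include(dataset_properties: Dict[str, Any],
                  include: Optional[List[str]]) -> str:
    default = 'NoEncoder'
    if len(dataset_properties['categorical_columns']) == 0:
        if include is not None and default not in include:
            raise ValueError("Provided {} in include, however, the dataset "
                             "is incompatible with it".format(include))
    return default


check_include({'categorical_columns': []}, include=['NoEncoder'])  # fine
try:
    check_include({'categorical_columns': []}, include=['OneHotEncoder'])
except ValueError as err:
    print(err)  # incompatible include for a numerical-only dataset
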
Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+from math import ceil, floor
+from typing import Any, Dict, Optional, Tuple, Union
+
+from ConfigSpace.conditions import EqualsCondition, InCondition
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+    UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
+)
+
+import numpy as np
+
+import sklearn.decomposition
+from sklearn.base import BaseEstimator
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.\
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+from autoPyTorch.utils.common import FitRequirement
+
+
+class KernelPCA(autoPyTorchFeaturePreprocessingComponent):
+    def __init__(self, n_components: int = 10,
+                 kernel: str = 'rbf', degree: int = 3,
+                 gamma: float = 0.01, coef0: float = 0.0,
+                 random_state: Optional[Union[int, np.random.RandomState]] = None
+                 ) -> None:
+        self.n_components = n_components
+        self.kernel = kernel
+        self.degree = degree
+        self.gamma = gamma
+        self.coef0 = coef0
+        self.random_state = random_state
+        super().__init__()
+
+        self.add_fit_requirements([
+            FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)])
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+
+        self.preprocessor['numerical'] = sklearn.decomposition.KernelPCA(
+            n_components=self.n_components, kernel=self.kernel,
+            degree=self.degree, gamma=self.gamma, coef0=self.coef0,
+            remove_zero_eig=True, random_state=self.random_state)
+
+        return self
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, str]] = None,
+        n_components: Tuple[Tuple, float] = ((0.5, 0.9), 0.5),
+        kernel: Tuple[Tuple, str] = (('poly', 'rbf', 'sigmoid', 'cosine'), 'rbf'),
+        gamma: Tuple[Tuple, float, bool] = ((3.0517578125e-05, 8), 0.01, True),
+        degree: Tuple[Tuple, int] = ((2, 5), 3),
+        coef0: Tuple[Tuple, float] = ((-1, 1), 0)
+    ) -> ConfigurationSpace:
+
+        if dataset_properties is not None:
+            n_features = len(dataset_properties['numerical_columns'])
+            n_components = ((floor(n_components[0][0] * n_features), ceil(n_components[0][1] * n_features)),
+                            ceil(n_components[1] * n_features))
+        else:
+            n_components = ((10, 2000), 100)
+
+        n_components = UniformIntegerHyperparameter(
+            "n_components", lower=n_components[0][0], upper=n_components[0][1], default_value=n_components[1])
+        kernel_hp = CategoricalHyperparameter('kernel', choices=kernel[0], default_value=kernel[1])
+        gamma = UniformFloatHyperparameter(
+            "gamma",
+            lower=gamma[0][0], upper=gamma[0][1],
+            log=gamma[2],
+            default_value=gamma[1],
+        )
+        coef0 = UniformFloatHyperparameter("coef0", lower=coef0[0][0], upper=coef0[0][1], default_value=coef0[1])
+        cs = ConfigurationSpace()
+        cs.add_hyperparameters([n_components, kernel_hp, gamma, coef0])
+
+        if "poly" in kernel_hp.choices:
+            degree = UniformIntegerHyperparameter('degree', lower=degree[0][0], upper=degree[0][1],
+                                                  default_value=degree[1])
+            cs.add_hyperparameters([degree])
+            degree_depends_on_poly = EqualsCondition(degree, kernel_hp, "poly")
+            cs.add_conditions([degree_depends_on_poly])
+        kernels = []
+        if "sigmoid" in kernel_hp.choices:
+            kernels.append("sigmoid")
+        if "poly" in kernel_hp.choices:
+            kernels.append("poly")
+        coef0_condition = InCondition(coef0, kernel_hp, kernels)
+        kernels = []
+        if "rbf" in kernel_hp.choices:
+            kernels.append("rbf")
+        if "poly" in kernel_hp.choices:
+            kernels.append("poly")
+        gamma_condition = InCondition(gamma, kernel_hp, kernels)
+        cs.add_conditions([coef0_condition, gamma_condition])
+        return cs
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        return {'shortname': 'KernelPCA',
+                'name': 'Kernel Principal Component Analysis',
+                'handles_sparse': True
+                }
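
The dynamic search space mentioned in the commit message comes from reading the incoming n_components bounds as fractions of the numerical-feature count and snapping them with floor/ceil. A worked example of that arithmetic (the feature count is made up):

from math import ceil, floor

n_components = ((0.5, 0.9), 0.5)  # ((lower_frac, upper_frac), default_frac)
n_features = 7                    # e.g. len(dataset_properties['numerical_columns'])

lower = floor(n_components[0][0] * n_features)  # floor(3.5) -> 3
upper = ceil(n_components[0][1] * n_features)   # ceil(6.3)  -> 7
default = ceil(n_components[1] * n_features)    # ceil(3.5)  -> 4
print(lower, upper, default)  # 3 7 4

Scaling the bounds with the dataset keeps the range meaningful whether the table has 5 numerical columns or 500; the same motivation is behind the commit's "Remove TruncatedSVD from cs if num numerical ==1".
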
Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.\
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+
+
+class NoFeaturePreprocessor(autoPyTorchFeaturePreprocessingComponent):
+    """
+    Don't perform feature preprocessing on categorical features
+    """
+    def __init__(self,
+                 random_state: Optional[Union[np.random.RandomState, int]] = None
+                 ):
+        super().__init__()
+        self.random_state = random_state
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchFeaturePreprocessingComponent:
+        """
+        The fit function calls the fit function of the underlying model
+        and returns the transformed array.
+        Args:
+            X (np.ndarray): input features
+            y (Optional[np.ndarray]): input labels
+
+        Returns:
+            instance of self
+        """
+        self.check_requirements(X, y)
+
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds the self into the 'X' dictionary and returns it.
+        Args:
+            X (Dict[str, Any]): 'X' dictionary
+
+        Returns:
+            (Dict[str, Any]): the updated 'X' dictionary
+        """
+        X.update({'feature_preprocessor': self.preprocessor})
+        return X
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'NoFeaturePreprocessing',
+            'name': 'No Feature Preprocessing',
+            'handles_sparse': True
+        }
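
A hedged sketch of the component's contract: it transforms nothing itself and only registers (the absence of) a preprocessor in the shared fit dictionary for downstream components. The toy class below mirrors that contract; the {'numerical': None, 'categorical': None} shape of self.preprocessor is an assumption based on how KernelPCA.fit above assigns into it.

from typing import Any, Dict


class ToyNoFeaturePreprocessor:
    """Toy mirror of NoFeaturePreprocessor's dict-passing contract."""

    def __init__(self) -> None:
        # Assumed base-class default: no fitted preprocessors.
        self.preprocessor: Dict[str, Any] = {'numerical': None, 'categorical': None}

    def fit(self, X: Dict[str, Any], y: Any = None) -> "ToyNoFeaturePreprocessor":
        return self  # nothing to fit for a no-op component

    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
        # Register (the lack of) a preprocessor for downstream components.
        X.update({'feature_preprocessor': self.preprocessor})
        return X


X: Dict[str, Any] = {}  # stand-in for the pipeline's shared fit dictionary
X = ToyNoFeaturePreprocessor().fit(X).transform(X)
print(X)  # {'feature_preprocessor': {'numerical': None, 'categorical': None}}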
