Embedding layer #91

Merged

Changes from all commits (75 commits)
a684ac4
work in progress
ravinkohli Feb 8, 2021
a6a8471
in progress
ravinkohli Feb 8, 2021
2b0c0e0
Working network embedding
ravinkohli Feb 15, 2021
9be86a5
ADD tests for network embedding
ravinkohli Feb 15, 2021
1adc9a4
Removed ordinal encoder
ravinkohli Feb 15, 2021
ae6bb44
Removed ordinal encoder
ravinkohli Feb 15, 2021
8783240
Add seed for test_losses for reproducibility
ravinkohli Feb 15, 2021
761fb75
Addressed comments
ravinkohli Feb 17, 2021
1e1a7e9
Fix merge conflicts
ravinkohli Feb 22, 2021
4bcbd88
fix flake
ravinkohli Feb 22, 2021
adcfc75
Merge refactor_development
ravinkohli Feb 23, 2021
d6d3dcd
Merge branch 'ravinkohli-embedding_layer' into embedding_layer
ravinkohli Feb 23, 2021
aa83d6d
fix test import training
ravinkohli Feb 23, 2021
cbc7e09
ADD_109
franchuterivera Feb 23, 2021
9dd447f
No print allow
franchuterivera Feb 23, 2021
20a874f
Fix tests and move to boston
ravinkohli Feb 23, 2021
7eed312
Debug issue with python 3.6
ravinkohli Feb 25, 2021
cb3b398
Debug for python3.6
ravinkohli Feb 26, 2021
007be7d
Run only debug file
ravinkohli Feb 26, 2021
d406b2a
Merge branch 'embedding_layer' of https://github.com/ravinkohli/Auto-…
ravinkohli Feb 26, 2021
b1e25d2
work in progress
ravinkohli Feb 8, 2021
f41eae1
in progress
ravinkohli Feb 8, 2021
6222399
Working network embedding
ravinkohli Feb 15, 2021
b96e32a
ADD tests for network embedding
ravinkohli Feb 15, 2021
ece6353
Removed ordinal encoder
ravinkohli Feb 15, 2021
70b0a79
Removed ordinal encoder
ravinkohli Feb 15, 2021
48d7a85
Addressed comments
ravinkohli Feb 17, 2021
a92fcaf
fix flake
ravinkohli Feb 22, 2021
a11ee8e
fix test import training
ravinkohli Feb 23, 2021
789bd8d
Fix tests and move to boston
ravinkohli Feb 23, 2021
85e178f
Debug issue with python 3.6
ravinkohli Feb 25, 2021
ddf198b
Run only debug file
ravinkohli Feb 26, 2021
a073f06
Debug for python3.6
ravinkohli Feb 26, 2021
e4599e9
Merge from origin
ravinkohli Feb 26, 2021
e625ee7
print paths of parent dir
ravinkohli Feb 26, 2021
9164bc2
Trying to run examples
ravinkohli Feb 26, 2021
f1beb14
Trying to run examples
ravinkohli Feb 27, 2021
af17afc
Add success model
ravinkohli Mar 1, 2021
d64e4fd
Added parent directory for printing paths
ravinkohli Mar 1, 2021
1602933
Try no autouse
franchuterivera Mar 1, 2021
c8d98ba
print log file to see if backend is saving num run
ravinkohli Mar 1, 2021
76fcd76
Setup logger in backend
ravinkohli Mar 1, 2021
ffc1620
handle nans in categorical columns (#118)
ravinkohli Mar 1, 2021
55ec853
Merge pull request #112 from franchuterivera/refactor_development_ADD…
ravinkohli Mar 1, 2021
3f39f58
try without embeddings
ravinkohli Mar 1, 2021
715d277
work in progress
ravinkohli Feb 8, 2021
d68a391
in progress
ravinkohli Feb 8, 2021
02dc064
Working network embedding
ravinkohli Feb 15, 2021
37cd8c5
ADD tests for network embedding
ravinkohli Feb 15, 2021
a3c1625
Removed ordinal encoder
ravinkohli Feb 15, 2021
b8896ad
Removed ordinal encoder
ravinkohli Feb 15, 2021
e0bfb0b
Addressed comments
ravinkohli Feb 17, 2021
23f6777
fix flake
ravinkohli Feb 22, 2021
4c1f33f
fix test import training
ravinkohli Feb 23, 2021
8c4233c
Fix tests and move to boston
ravinkohli Feb 23, 2021
18b5771
Debug issue with python 3.6
ravinkohli Feb 25, 2021
d839b5d
Run only debug file
ravinkohli Feb 26, 2021
de1d4c3
Debug for python3.6
ravinkohli Feb 26, 2021
e0a488a
work in progress
ravinkohli Feb 8, 2021
7368908
in progress
ravinkohli Feb 8, 2021
00789ac
Working network embedding
ravinkohli Feb 15, 2021
6a02fe4
ADD tests for network embedding
ravinkohli Feb 15, 2021
3f7c2cc
print paths of parent dir
ravinkohli Feb 26, 2021
f765347
Trying to run examples
ravinkohli Feb 26, 2021
6ad8550
Trying to run examples
ravinkohli Feb 27, 2021
011c0ef
Add success model
ravinkohli Mar 1, 2021
1efc39a
Added parent directory for printing paths
ravinkohli Mar 1, 2021
3d54db8
print log file to see if backend is saving num run
ravinkohli Mar 1, 2021
6c5e8be
Setup logger in backend
ravinkohli Mar 1, 2021
8734384
try without embeddings
ravinkohli Mar 1, 2021
8941c95
no embedding for python 3.6
ravinkohli Mar 2, 2021
6de97e1
merge
ravinkohli Mar 2, 2021
5aec1e1
Deleted debug example
ravinkohli Mar 2, 2021
36ae93c
Fix test for evaluation
ravinkohli Mar 2, 2021
c9ef56e
Deleted utils file
ravinkohli Mar 4, 2021
3 changes: 2 additions & 1 deletion .github/workflows/examples.yml
Original file line number Diff line number Diff line change
@@ -31,4 +31,5 @@ jobs:
- name: Run tests
run: |
python examples/example_tabular_classification.py
python examples/example_image_classification.py
python examples/example_tabular_regression.py
python examples/example_image_classification.py
79 changes: 41 additions & 38 deletions autoPyTorch/data/tabular_feature_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import sklearn.utils
from sklearn import preprocessing
from sklearn.base import BaseEstimator
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import NotFittedError

from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES
@@ -53,16 +53,34 @@ def _fit(
for column in X.columns:
if X[column].isna().all():
X[column] = pd.to_numeric(X[column])
# Also note this change in self.dtypes
if len(self.dtypes) != 0:
self.dtypes[list(X.columns).index(column)] = X[column].dtype

self.enc_columns, self.feat_type = self._get_columns_to_encode(X)

if len(self.enc_columns) > 0:

self.encoder = make_column_transformer(
(preprocessing.OrdinalEncoder(
handle_unknown='use_encoded_value',
unknown_value=-1,
), self.enc_columns),
# impute missing values before encoding,
# remove once sklearn natively supports
# it in ordinal encoding. Sklearn issue:
# "https://github.com/scikit-learn/scikit-learn/issues/17123)"
for column in self.enc_columns:
if X[column].isna().any():
missing_value: typing.Union[int, str] = -1
# make sure for a string column we give
# string missing value else we give numeric
if type(X[column][0]) == str:
missing_value = str(missing_value)
X[column] = X[column].cat.add_categories([missing_value])
X[column] = X[column].fillna(missing_value)

self.encoder = ColumnTransformer(
[
("encoder",
preprocessing.OrdinalEncoder(
handle_unknown='use_encoded_value',
unknown_value=-1,
), self.enc_columns)],
remainder="passthrough"
)

@@ -85,6 +103,7 @@ def comparator(cmp1: str, cmp2: str) -> int:
return 1
else:
raise ValueError((cmp1, cmp2))

self.feat_type = sorted(
self.feat_type,
key=functools.cmp_to_key(comparator)
@@ -182,9 +201,8 @@ def _check_data(
if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X):
raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
" scipy sparse and Python Lists, yet, the provided input is"
" of type {}".format(
type(X)
))
" of type {}".format(type(X))
)

if self.data_type is None:
self.data_type = type(X)
@@ -217,39 +235,25 @@ def _check_data(
# per estimator
enc_columns, _ = self._get_columns_to_encode(X)

if len(enc_columns) > 0:
if np.any(pd.isnull(
X[enc_columns].dropna( # type: ignore[call-overload]
axis='columns', how='all')
)):
# Ignore all NaN columns, and if still a NaN
# Error out
raise ValueError("Categorical features in a dataframe cannot contain "
"missing/NaN values. The OrdinalEncoder used by "
"AutoPyTorch cannot handle this yet (due to a "
"limitation on scikit-learn being addressed via: "
"https://github.com/scikit-learn/scikit-learn/issues/17123)"
)
column_order = [column for column in X.columns]
if len(self.column_order) > 0:
if self.column_order != column_order:
raise ValueError("Changing the column order of the features after fit() is "
"not supported. Fit() method was called with "
"{} whereas the new features have {} as type".format(
self.column_order,
column_order,
))
"{} whereas the new features have {} as type".format(self.column_order,
column_order,)
)
else:
self.column_order = column_order
dtypes = [dtype.name for dtype in X.dtypes]
if len(self.dtypes) > 0:
if self.dtypes != dtypes:
raise ValueError("Changing the dtype of the features after fit() is "
"not supported. Fit() method was called with "
"{} whereas the new features have {} as type".format(
self.dtypes,
dtypes,
))
"{} whereas the new features have {} as type".format(self.dtypes,
dtypes,
)
)
else:
self.dtypes = dtypes

@@ -294,7 +298,8 @@ def _get_columns_to_encode(
"pandas.Series.astype ."
"If working with string objects, the following "
"tutorial illustrates how to work with text data: "
"https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( # noqa: E501
"https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format(
# noqa: E501
column,
)
)
@@ -349,15 +354,13 @@ def list_to_dataframe(
# If a list was provided, it will be converted to pandas
X_train = pd.DataFrame(data=X_train).infer_objects()
self.logger.warning("The provided feature types to AutoPyTorch are of type list."
"Features have been interpreted as: {}".format(
[(col, t) for col, t in zip(X_train.columns, X_train.dtypes)]
))
"Features have been interpreted as: {}".format([(col, t) for col, t in
zip(X_train.columns, X_train.dtypes)]))
if X_test is not None:
if not isinstance(X_test, list):
self.logger.warning("Train features are a list while the provided test data"
"is {}. X_test will be casted as DataFrame.".format(
type(X_test)
))
"is {}. X_test will be casted as DataFrame.".format(type(X_test))
)
X_test = pd.DataFrame(data=X_test).infer_objects()
return X_train, X_test

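The hunk above swaps `make_column_transformer` for an explicit `ColumnTransformer` and fills missing values in categorical columns with a sentinel before ordinal encoding, as a workaround for scikit-learn issue #17123. Below is a minimal, self-contained sketch of that pattern; the toy DataFrame and column names are illustrative, not taken from the validator itself.

```python
# Minimal sketch: fill NaNs in categorical columns with a sentinel, then
# ordinal-encode them via an explicit ColumnTransformer with passthrough.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

X = pd.DataFrame({
    "color": pd.Series(["red", None, "blue"], dtype="category"),
    "size": [1.0, 2.0, 3.0],
})

enc_columns = ["color"]
for column in enc_columns:
    if X[column].isna().any():
        # use a string sentinel for string categories, a numeric one otherwise
        missing_value = "-1" if isinstance(X[column].dropna().iloc[0], str) else -1
        X[column] = X[column].cat.add_categories([missing_value]).fillna(missing_value)

encoder = ColumnTransformer(
    [("encoder",
      OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
      enc_columns)],
    remainder="passthrough",
)
print(encoder.fit_transform(X))  # categorical column encoded, "size" passed through
```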
3 changes: 3 additions & 0 deletions autoPyTorch/evaluation/abstract_evaluator.py
Original file line number Diff line number Diff line change
@@ -331,6 +331,8 @@ def __init__(self, backend: Backend,
name=logger_name,
port=logger_port,
)
self.backend.setup_logger(name=logger_name, port=logger_port)

self.Y_optimization: Optional[np.ndarray] = None
self.Y_actual_train: Optional[np.ndarray] = None
self.pipelines: Optional[List[BaseEstimator]] = None
@@ -538,6 +540,7 @@ def file_output(
else:
pipeline = None

self.logger.debug("Saving directory {}, {}, {}".format(self.seed, self.num_run, self.budget))
self.backend.save_numrun_to_dir(
seed=int(self.seed),
idx=int(self.num_run),
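The evaluator now also wires a logger into the backend (`backend.setup_logger`) so that file operations such as `save_numrun_to_dir` appear in the run log. The sketch below only illustrates the general idea of a named, port-based client logger using the standard library; it is an assumption about the shape of that call, not AutoPyTorch's actual implementation.

```python
# Assumed pattern: a named logger that forwards records to a log receiver
# listening on a port, so backend file operations are traceable.
import logging
import logging.handlers


def setup_logger(name: str, port: int, host: str = "localhost") -> logging.Logger:
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    # forward pickled LogRecords to whatever log server listens on `port`
    logger.addHandler(logging.handlers.SocketHandler(host, port))
    return logger


backend_logger = setup_logger("AutoPyTorch-backend", port=9020)
backend_logger.debug("Saving directory %s, %s, %s", 1, 2, 50.0)
```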
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@

import numpy as np

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

import torch
@@ -57,9 +57,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
if len(X['dataset_properties']['categorical_columns']):
categorical_pipeline = make_pipeline(*preprocessors['categorical'])

self.preprocessor = make_column_transformer(
(numerical_pipeline, X['dataset_properties']['numerical_columns']),
(categorical_pipeline, X['dataset_properties']['categorical_columns']),
self.preprocessor = ColumnTransformer([
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']),
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])],
remainder='passthrough'
)

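As in the feature validator, the preprocessor here moves from `make_column_transformer` to an explicit `ColumnTransformer` with named numerical and categorical pipelines. A rough standalone sketch follows; the imputer, scaler, encoder and column indices are placeholders for whatever preprocessors the pipeline actually builds.

```python
# Sketch of the named-pipeline ColumnTransformer; steps and columns are made up.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numerical_columns = [0, 1]     # column indices, as dataset_properties would provide
categorical_columns = [2]

numerical_pipeline = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))

preprocessor = ColumnTransformer(
    [
        ("numerical_pipeline", numerical_pipeline, numerical_columns),
        ("categorical_pipeline", categorical_pipeline, categorical_columns),
    ],
    remainder="passthrough",
)

X = np.array([[1.0, 2.0, 0], [3.0, np.nan, 1], [5.0, 6.0, 0]])
Xt = preprocessor.fit_transform(X)
# Explicit names make the fitted steps easy to look up later, e.g.
# preprocessor.named_transformers_["categorical_pipeline"], instead of the
# auto-generated names make_column_transformer would assign.
```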

This file was deleted.

Original file line number Diff line number Diff line change
@@ -65,7 +65,7 @@ def get_hyperparameter_search_space(self,
raise ValueError("no encoders found, please add a encoder")

if default is None:
defaults = ['OneHotEncoder', 'OrdinalEncoder', 'NoEncoder']
defaults = ['OneHotEncoder', 'NoEncoder']
for default_ in defaults:
if default_ in available_preprocessors:
if include is not None and default_ not in include:
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@ def __init__(
self.add_fit_requirements([
FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False),
FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False),
FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False),
])
self.final_activation = None

@@ -47,7 +48,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent:
# information to fit this stage
self.check_requirements(X, y)

self.network = torch.nn.Sequential(X['network_backbone'], X['network_head'])
self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head'])

# Properly set the network training device
if self.device is None:
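With the new `network_embedding` fit requirement, the final network is assembled as embedding -> backbone -> head instead of backbone -> head. A toy sketch of that composition, with made-up layer sizes and an identity module standing in for the "no embedding" case:

```python
# Toy sketch of the new network assembly; in AutoPyTorch each part is produced
# by its own pipeline component, the sizes here are illustrative only.
import torch
from torch import nn

network_embedding = nn.Identity()                       # pass-through when no embedding is used
network_backbone = nn.Sequential(nn.Linear(10, 64), nn.ReLU())
network_head = nn.Linear(64, 2)

network = nn.Sequential(network_embedding, network_backbone, network_head)

x = torch.randn(8, 10)                                  # batch of 8 rows with 10 features
print(network(x).shape)                                 # torch.Size([8, 2])
```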
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@
from autoPyTorch.pipeline.components.base_component import (
autoPyTorchComponent,
)
from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape
from autoPyTorch.utils.common import FitRequirement


@@ -31,7 +32,9 @@ def __init__(self,
FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True,
dataset_property=False),
FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True),
FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False)])
FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False),
FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False)
])
self.backbone: nn.Module = None
self.config = kwargs
self.input_shape: Optional[Iterable] = None
@@ -56,6 +59,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
column_transformer = X['tabular_transformer'].preprocessor
input_shape = column_transformer.transform(X_train[:1]).shape[1:]

input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape)
self.input_shape = input_shape

self.backbone = self.build_backbone(
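The backbone now derives its input shape from the embedding's output via `get_output_shape`, rather than straight from the column transformer. The helper below mimics that idea with a dummy forward pass; it is a guess at the mechanism for illustration, not the actual utility from `network_backbone.utils`.

```python
# Illustrative helper: infer a module's output shape by pushing one fake sample
# through it, the way the backbone's input shape is derived from the embedding.
from typing import Tuple

import torch
from torch import nn


def output_shape_of(module: nn.Module, input_shape: Tuple[int, ...]) -> Tuple[int, ...]:
    placeholder = torch.randn((1, *input_shape), dtype=torch.float)
    with torch.no_grad():
        output = module(placeholder)
    return tuple(output.shape[1:])


embedding = nn.Linear(20, 32)          # stand-in for a learned embedding module
input_shape = (20,)                    # shape produced by the column transformer
input_shape = output_shape_of(embedding, input_shape=input_shape)
print(input_shape)                     # (32,) -> what build_backbone would receive
```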