optimize_run_pipeline_categorical_features.py
"""
This example shows how to use the Pipeline class on its own. You may use this
approach when you want to test a specific classification pipeline. The dataset
used here contains both categorical and numerical features.
"""
from niaaml import Pipeline
from niaaml.classifiers import MultiLayerPerceptron
from niaaml.preprocessing.feature_selection import VarianceThreshold
from niaaml.preprocessing.feature_transform import Normalizer
from niaaml.data import CSVDataReader
from niaaml.preprocessing.encoding import encode_categorical_features
import os
import pandas
# prepare a data reader that loads the dataset from a CSV file
data_reader = CSVDataReader(
    src=os.path.dirname(os.path.abspath(__file__))
    + "/example_files/dataset_categorical.csv",
    has_header=False,
    contains_classes=True,
)
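# optional sanity check (an illustrative addition, not part of the original
# example): get_x() returns the feature matrix and get_y() the class labels
print(len(data_reader.get_x()), len(data_reader.get_y()))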
# we use the utility method encode_categorical_features to get encoders for the
# categorical features, but you may also instantiate and fit feature encoders
# yourself and pass them in as an array (as long as they follow this framework's
# interface) -- there should be as many encoders as there are categorical
# features; this example uses one-hot encoding
_, encoders = encode_categorical_features(data_reader.get_x(), "OneHotEncoder")
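# quick check (an illustrative addition): encode_categorical_features returns
# one fitted encoder per categorical column, so this dataset should yield one
print(len(encoders))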
# instantiate a Pipeline object
pipeline = Pipeline(
    feature_selection_algorithm=VarianceThreshold(),
    feature_transform_algorithm=Normalizer(),
    classifier=MultiLayerPerceptron(),
    categorical_features_encoders=encoders,
)
# run the pipeline optimization process (returns the fitness value and, along
# the way, sets the best parameters found for the classifier, feature selection
# algorithm and feature transform algorithm)
pipeline.optimize(
    data_reader.get_x(),
    data_reader.get_y(),
    10,  # population size
    50,  # number of evaluations
    "ParticleSwarmAlgorithm",
    "Accuracy",
)
# run the pipeline on dummy data
# (you could run the pipeline before the optimization process, but you would get
# meaningless predictions, as nothing in the pipeline has been fit to the dataset yet)
predicted = pipeline.run(
    pandas.DataFrame(
        [
            [
                10.32440339,
                3.195964543,
                1.215275549,
                3.741461311,
                11.6736581,
                6.435247906,
                "a",
            ]
        ]
    )
)
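# inspect the prediction for the dummy sample (an illustrative addition)
print(predicted)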
# the pipeline variable now holds a Pipeline object that can be used for further
# classification, exported as an object (to be loaded and reused later) or
# exported as a text file
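# a minimal sketch of the export options mentioned above; the file names are
# illustrative (Pipeline provides export, export_text and load methods)
pipeline.export("exported_pipeline.ppln")  # serialize the object for later reuse
pipeline.export_text("exported_pipeline.txt")  # human-readable description
loaded_pipeline = Pipeline.load("exported_pipeline.ppln")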