Skip to content

A transformer for generating singular or pairwise tensor product Bernstein polynomial features. #173

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1bc2111
Initial implementation of Bernstein polynomials.
alexshtf May 19, 2024
0e64884
Fixed a bug which disallowed passing NaN to bernstein transformers.
alexshtf May 19, 2024
aea1a79
Added documentation for the polynomial basis.
alexshtf May 19, 2024
b5141b9
Fixed documentation.
alexshtf May 19, 2024
8eb965f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2024
45f6e30
Add input array checks.
alexshtf May 19, 2024
236c63c
Merge remote-tracking branch 'origin/main'
alexshtf May 19, 2024
44bcfbf
Bugfix.
alexshtf May 19, 2024
6b15569
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2024
d5d7203
More bugfixes.
alexshtf May 19, 2024
cb77b09
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2024
280d0be
No need to validate order of X.
alexshtf May 19, 2024
d6d720e
Added estimator tags.
alexshtf May 19, 2024
23bbd53
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2024
457d76d
1D feature is now a column, rather than a row.
alexshtf May 19, 2024
3788138
Merge remote-tracking branch 'origin/main'
alexshtf May 19, 2024
457da1f
Removed the stateless flag.
alexshtf May 19, 2024
5f02b3a
Fixed test_correct_param_types() test.
alexshtf May 19, 2024
b8aa359
Fixed tests.
alexshtf May 19, 2024
6ffd8ec
More test fixes.
alexshtf May 19, 2024
2c14c29
fixed expected output in tests.
alexshtf May 19, 2024
27c310f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2024
15df21e
Remove redundant print statement.
alexshtf May 19, 2024
a03b4c8
Merge remote-tracking branch 'origin/main'
alexshtf May 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions sklearn_extra/preprocessing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from ._polynomial_basis import (
PolynomialBasisTransformer,
BernsteinFeatures,
)

# Names exported by ``from sklearn_extra.preprocessing import *``.
__all__ = [
    "PolynomialBasisTransformer",
    "BernsteinFeatures",
]
137 changes: 137 additions & 0 deletions sklearn_extra/preprocessing/_polynomial_basis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""BernsteinTransformer for generating polynomial features using the Bernstein basis."""

# Author: Alex Shtoff
# License: BSD 3 clause

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import (
FLOAT_DTYPES,
check_is_fitted,
check_scalar,
)
from itertools import combinations
from scipy.stats import binom


class PolynomialBasisTransformer(TransformerMixin, BaseEstimator):
    """
    Polynomial basis transformer for generating polynomial features.

    This transformer generates polynomial features of a given degree
    for each data column separately, or tensor-product features for every
    pair of columns if interactions=True.

    Parameters
    ----------

    degree : int, default=5
        The degree of the polynomial basis. Must be non-negative.

    bias : bool, default=False
        If False, the first column of every generated feature block is
        dropped, assuming it represents the constant term. If True, all
        columns are kept.

    na_value : float, default=0.
        The value substituted for NaN entries of the basis matrix returned
        by `vandermonde_matrix` (for example, those arising from NaN
        inputs).

    interactions : bool, default=False
        If True, tensor-product features for every pair of columns are
        generated instead of the per-column features. If bias=False, the
        first column of each pairwise block, i.e. the product of the first
        basis functions of both columns, is excluded.


    Notes
    -----
    Inheriting classes should override the `vandermonde_matrix` method to
    generate the Vandermonde matrix of the concrete polynomial basis.


    References
    ----------
    [1] https://en.wikipedia.org/wiki/Vandermonde_matrix
    """

    def __init__(self, degree=5, bias=False, na_value=0.0, interactions=False):
        self.degree = degree
        self.bias = bias
        self.na_value = na_value
        self.interactions = interactions

    def fit(self, X, y=None):
        """Validate the hyperparameters and the input data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data. NaN entries are allowed.

        y : ignored
            Present for API consistency only.

        Returns
        -------
        self : object
            The fitted transformer.

        Raises
        ------
        TypeError
            If a hyperparameter has the wrong type.
        ValueError
            If ``degree`` is negative.
        """
        # Validate without re-assigning: ``check_scalar`` returns its input
        # unchanged, and fit should not modify the constructor parameters.
        check_scalar(self.degree, "degree", int, min_val=0)
        check_scalar(self.bias, "bias", bool)
        check_scalar(self.na_value, "na_value", float)
        check_scalar(self.interactions, "interactions", bool)
        self._validate_data(X, force_all_finite="allow-nan")
        self.is_fitted_ = True
        return self

    def transform(self, X):
        """Generate the polynomial basis features for ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data. NaN entries are allowed.

        Returns
        -------
        features : ndarray
            Horizontally stacked feature blocks: one block per column, or
            one block per pair of columns if ``interactions=True``.

        Raises
        ------
        ValueError
            If ``interactions=True`` and ``X`` has fewer than two columns.
        """
        check_is_fitted(self)

        X = self._validate_data(
            X,
            dtype=FLOAT_DTYPES,
            reset=False,
            force_all_finite="allow-nan",
        )

        n_rows, n_features = X.shape

        # There are no column pairs to combine with fewer than two columns;
        # fail with a clear message instead of an obscure np.hstack error.
        if self.interactions and n_features < 2:
            raise ValueError(
                "interactions=True requires at least two input features, "
                f"got {n_features}."
            )

        # Compute the specific polynomial basis for each column
        basis_features = [
            self.feature_matrix(X[:, i]) for i in range(n_features)
        ]

        # create interaction features - basis tensor products
        if self.interactions:
            result_basis = [
                (u[:, None, :] * v[:, :, None]).reshape(n_rows, -1)
                for u, v in combinations(basis_features, 2)
            ]
        else:
            result_basis = basis_features

        # remove the first basis function, if no bias is required
        if not self.bias:
            result_basis = [basis[:, 1:] for basis in result_basis]

        return np.hstack(result_basis)

    def feature_matrix(self, column):
        """Return the basis matrix of ``column`` with NaNs replaced.

        Parameters
        ----------
        column : ndarray of shape (n_samples,)
            A single input column.

        Returns
        -------
        ndarray
            The output of `vandermonde_matrix` with NaN entries replaced by
            ``na_value``.
        """
        vander = self.vandermonde_matrix(column)
        # ``nan`` must be passed by keyword: the second positional parameter
        # of np.nan_to_num is ``copy``, not the replacement value.
        return np.nan_to_num(vander, nan=self.na_value)

    def vandermonde_matrix(self, column):
        """Return the basis matrix of ``column``; implemented by subclasses."""
        raise NotImplementedError("Subclasses must implement this method.")

    def _get_tags(self):
        """Extend the inherited estimator tags: NaN allowed, ``y`` not required."""
        base_tags = super()._get_tags()
        return base_tags | {
            "allow_nan": True,
            "requires_y": False,
        }


class BernsteinFeatures(PolynomialBasisTransformer):
    """
    Polynomial features expressed in the Bernstein basis.

    Each basis function is evaluated as a binomial probability mass
    function: entry ``[i, k]`` of the basis matrix is
    ``binom.pmf(k, degree, column[i])`` for ``k = 0, ..., degree``.

    NOTE(review): inputs are presumably expected to lie in [0, 1], since they
    are used as the binomial success probability - confirm.

    See Also
    --------
    PolynomialBasisTransformer

    References
    ----------
    [1] https://en.wikipedia.org/wiki/Bernstein_polynomial
    [2] https://alexshtf.github.io/2024/02/11/Bernstein-Sklearn.html
    """

    def vandermonde_matrix(self, column):
        """Return the ``(len(column), degree + 1)`` Bernstein basis matrix."""
        n = self.degree
        ks = np.arange(1 + n)
        # A column vector of probabilities broadcasts against the row of
        # basis indices, giving one matrix row per sample.
        probabilities = column[:, None]
        return binom.pmf(ks, n, probabilities)
69 changes: 69 additions & 0 deletions sklearn_extra/preprocessing/tests/test_polynomial_basis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import numpy as np
import pytest
from sklearn.utils.estimator_checks import parametrize_with_checks

from sklearn_extra.preprocessing import BernsteinFeatures

from sklearn.utils._testing import (
assert_array_almost_equal,
)

# Single-column fixture: the points 0, 0.5 and 1 plus one NaN entry.
feature_1d = np.array([0, 0.5, 1, np.nan]).reshape(-1, 1)
# Two-column fixture with a NaN in the first column of the last row.
feature_2d = np.array([[0, 0.25], [0.5, 0.5], [np.nan, 0.75]])


@parametrize_with_checks([BernsteinFeatures()])
def test_sklearn_compatibility(estimator, check):
    """Run each check supplied by ``parametrize_with_checks`` on a default estimator."""
    check(estimator)


def test_correct_param_types():
    """Invalid hyperparameters must be rejected when ``fit`` is called."""
    # (expected exception, constructor kwargs), checked in this order.
    invalid_cases = [
        (TypeError, {"na_value": "a"}),
        (ValueError, {"degree": -1}),
        (TypeError, {"degree": "a"}),
        (TypeError, {"bias": -1}),
        (TypeError, {"interactions": -1}),
    ]
    for expected_error, bad_params in invalid_cases:
        with pytest.raises(expected_error):
            BernsteinFeatures(**bad_params).fit(feature_1d)


def test_correct_output_one_feature():
    """Degree-2 features of a single column, first basis function dropped."""
    transformer = BernsteinFeatures(degree=2)
    transformer.fit(feature_1d)
    actual = transformer.transform(feature_1d)

    expected = np.array(
        [
            [0.0, 0.0],  # x = 0
            [0.5, 0.25],  # x = 0.5
            [0.0, 1.0],  # x = 1
            [0.0, 0.0],  # x = NaN
        ]
    )
    assert_array_almost_equal(actual, expected)


def test_correct_output_two_features():
    """Degree-2 features of two columns are stacked side by side."""
    transformer = BernsteinFeatures(degree=2)
    transformer.fit(feature_2d)
    actual = transformer.transform(feature_2d)

    # Columns 0-1 come from the first input column, 2-3 from the second.
    row_a = [0.0, 0.0, 0.375, 0.0625]
    row_b = [0.5, 0.25, 0.5, 0.25]
    row_c = [0.0, 0.0, 0.375, 0.5625]
    expected = np.array([row_a, row_b, row_c])

    assert_array_almost_equal(actual, expected)


def test_correct_output_interactions():
    """Degree-2 pairwise tensor-product features of the two-column fixture."""
    transformer = BernsteinFeatures(degree=2, interactions=True)
    transformer.fit(feature_2d)
    actual = transformer.transform(feature_2d)

    row_a = [0.0, 0.0, 0.375, 0.0, 0.0, 0.0625, 0.0, 0.0]
    row_b = [0.125, 0.0625, 0.125, 0.25, 0.125, 0.0625, 0.125, 0.0625]
    # The last row has a NaN in its first column; every product is 0.
    row_c = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    expected = np.array([row_a, row_b, row_c])

    assert_array_almost_equal(actual, expected)
Loading