Skip to content

Check array index fix #320

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
14 changes: 2 additions & 12 deletions category_encoders/cat_boost.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,7 @@ def fit(self, X, y, **kwargs):
"""

# unite the input into pandas types
X = util.convert_input(X)
y = util.convert_input_vector(y, X.index).astype(float)

if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
X, y = util.convert_inputs(X, y)

self._dim = X.shape[1]

Expand Down Expand Up @@ -198,18 +194,12 @@ def transform(self, X, y=None, override_return_df=False):
raise ValueError('Must train encoder before it can be used to transform data.')

# unite the input into pandas types
X = util.convert_input(X)
X, y = util.convert_inputs(X, y)

# then make sure that it is the right size
if X.shape[1] != self._dim:
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

# if we are encoding the training data, we have to check the target
if y is not None:
y = util.convert_input_vector(y, X.index).astype(float)
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

if not list(self.cols):
return X
X = self._transform(
Expand Down
19 changes: 3 additions & 16 deletions category_encoders/glmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,8 @@ def fit(self, X, y, **kwargs):
"""

# Unite parameters into pandas types
X = util.convert_input(X)
y = util.convert_input_vector(y, X.index).astype(float)

# The lengths must be equal
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
X, y = util.convert_inputs(X, y)
y = y.astype(float)

self._dim = X.shape[1]

Expand Down Expand Up @@ -209,24 +205,15 @@ def transform(self, X, y=None, override_return_df=False):
raise ValueError('Must train encoder before it can be used to transform data.')

# Unite the input into pandas types
X = util.convert_input(X)
X, y = util.convert_inputs(X, y, deep=True)

# Then make sure that it is the right size
if X.shape[1] != self._dim:
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

# If we are encoding the training data, we have to check the target
if y is not None:
y = util.convert_input_vector(y, X.index).astype(float)
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

if not list(self.cols):
return X

# Do not modify the input argument
X = X.copy(deep=True)

X = self.ordinal_encoder.transform(X)

if self.handle_unknown == 'error':
Expand Down
18 changes: 2 additions & 16 deletions category_encoders/james_stein.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,12 +177,7 @@ def fit(self, X, y, **kwargs):
"""

# Unite parameters into pandas types
X = util.convert_input(X)
y = util.convert_input_vector(y, X.index).astype(float)

# The lengths must be equal
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
X, y = util.convert_inputs(X, y)

self._dim = X.shape[1]

Expand Down Expand Up @@ -273,24 +268,15 @@ def transform(self, X, y=None, override_return_df=False):
raise ValueError('Must train encoder before it can be used to transform data.')

# Unite the input into pandas types
X = util.convert_input(X)
X, y = util.convert_inputs(X, y, deep=True)

# Then make sure that it is the right size
if X.shape[1] != self._dim:
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

# If we are encoding the training data, we have to check the target
if y is not None:
y = util.convert_input_vector(y, X.index).astype(float)
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

if not list(self.cols):
return X

# Do not modify the input argument
X = X.copy(deep=True)

X = self.ordinal_encoder.transform(X)

if self.handle_unknown == 'error':
Expand Down
20 changes: 6 additions & 14 deletions category_encoders/leave_one_out.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,8 @@ def fit(self, X, y, **kwargs):
"""

# unite the input into pandas types
X = util.convert_input(X)
y = util.convert_input_vector(y, X.index).astype(float)

if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
X, y = util.convert_inputs(X, y)
y = y.astype(float)

self._dim = X.shape[1]

Expand Down Expand Up @@ -182,18 +179,14 @@ def transform(self, X, y=None, override_return_df=False):
raise ValueError('Must train encoder before it can be used to transform data.')

# unite the input into pandas types
X = util.convert_input(X)
X, y = util.convert_inputs(X, y, deep=True)
if y is not None:
y = y.astype(float)

# then make sure that it is the right size
if X.shape[1] != self._dim:
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

# if we are encoding the training data, we have to check the target
if y is not None:
y = util.convert_input_vector(y, X.index).astype(float)
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

if not list(self.cols):
return X
X = self.transform_leave_one_out(
Expand Down Expand Up @@ -234,12 +227,11 @@ def fit_column_map(self, series, y):
result = y.groupby(codes).agg(['sum', 'count'])
return result.rename(return_map)

def transform_leave_one_out(self, X_in, y, mapping=None):
def transform_leave_one_out(self, X, y, mapping=None):
"""
Leave one out encoding uses a single column of floats to represent the means of the target variables.
"""

X = X_in.copy(deep=True)
random_state_ = check_random_state(self.random_state)

for col, colmap in mapping.items():
Expand Down
18 changes: 2 additions & 16 deletions category_encoders/m_estimate.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,7 @@ def fit(self, X, y, **kwargs):
"""

# Unite parameters into pandas types
X = util.convert_input(X)
y = util.convert_input_vector(y, X.index).astype(float)

# The lengths must be equal
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
X, y = util.convert_inputs(X, y)

self._dim = X.shape[1]

Expand Down Expand Up @@ -205,24 +200,15 @@ def transform(self, X, y=None, override_return_df=False):
raise ValueError('Must train encoder before it can be used to transform data.')

# Unite the input into pandas types
X = util.convert_input(X)
X, y = util.convert_inputs(X, y, deep=True)

# Then make sure that it is the right size
if X.shape[1] != self._dim:
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

# If we are encoding the training data, we have to check the target
if y is not None:
y = util.convert_input_vector(y, X.index).astype(float)
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

if not list(self.cols):
return X

# Do not modify the input argument
X = X.copy(deep=True)

X = self.ordinal_encoder.transform(X)

if self.handle_unknown == 'error':
Expand Down
30 changes: 5 additions & 25 deletions category_encoders/quantile_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,8 @@ def fit(self, X, y, **kwargs):
"""

# unite the input into pandas types
X = util.convert_input(X)
y = util.convert_input_vector(y, X.index).astype(float)

if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
X, y = util.convert_inputs(X, y)
y = y.astype(float)

self._dim = X.shape[1]

Expand Down Expand Up @@ -220,7 +217,7 @@ def transform(self, X, y=None, override_return_df=False):
raise ValueError("Must train encoder before it can be used to transform data.")

# unite the input into pandas types
X = util.convert_input(X)
X, y = util.convert_inputs(X, y)

# then make sure that it is the right size
if X.shape[1] != self._dim:
Expand All @@ -232,14 +229,6 @@ def transform(self, X, y=None, override_return_df=False):
)
)

# if we are encoding the training data, we have to check the target
if y is not None:
y = util.convert_input_vector(y, X.index)
if X.shape[0] != y.shape[0]:
raise ValueError(
"The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + "."
)

if not list(self.cols):
return X

Expand Down Expand Up @@ -376,8 +365,7 @@ def __init__(
self.encoder_list = None

def fit(self, X, y):
X = util.convert_input(X)
y = util.convert_input_vector(y, X.index).astype(float)
X, y = util.convert_inputs(X, y)

if self.cols is None:
self.cols = util.get_obj_cols(X)
Expand Down Expand Up @@ -418,15 +406,7 @@ def fit(self, X, y):
def transform(self, X, y=None, override_return_df=False):
if self.encoder_list is None:
raise ValueError("Must train encoder before it can be used to transform data.")
X = util.convert_input(X)

# if we are encoding the training data, we have to check the target
if y is not None:
y = util.convert_input_vector(y, X.index)
if X.shape[0] != y.shape[0]:
raise ValueError(
"The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + "."
)
X, y = util.convert_inputs(X, y)

orig_cols = X.columns
transformed_df = X.copy()
Expand Down
14 changes: 2 additions & 12 deletions category_encoders/target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,7 @@ def fit(self, X, y, **kwargs):
"""

# unite the input into pandas types
X = util.convert_input(X)
y = util.convert_input_vector(y, X.index)

if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
X, y = util.convert_inputs(X, y)

self._dim = X.shape[1]

Expand Down Expand Up @@ -213,18 +209,12 @@ def transform(self, X, y=None, override_return_df=False):
raise ValueError('Must train encoder before it can be used to transform data.')

# unite the input into pandas types
X = util.convert_input(X)
X, y = util.convert_inputs(X, y)

# then make sure that it is the right size
if X.shape[1] != self._dim:
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

# if we are encoding the training data, we have to check the target
if y is not None:
y = util.convert_input_vector(y, X.index)
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

if not list(self.cols):
return X

Expand Down
48 changes: 43 additions & 5 deletions category_encoders/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,43 @@ def is_category(dtype):
return pd.api.types.is_categorical_dtype(dtype)


def convert_input(X, columns=None, deep=False):
def convert_inputs(X, y, columns=None, index=None, deep=False):
    """
    Unite arraylike `X` and vectorlike `y` into a DataFrame and Series.

    If both are pandas types already, raises an error if their indexes do not match.
    If one is pandas, the returns will share that index.
    If neither is pandas, a default index will be used, unless `index` is passed.

    Parameters
    ----------
    X: arraylike
    y: listlike
        May be `None`; in that case only `X` is converted and `None` is
        returned in place of `y`.
    columns: listlike
        Specifies column names to use for `X`.
        Ignored if `X` is already a dataframe.
        If `None`, use the default pandas column names.
    index: listlike
        The index to use, if neither `X` nor `y` is a pandas type.
        (If one has an index, then this has no effect.)
        If `None`, use the default pandas index.
    deep: bool
        Whether to deep-copy `X`.

    Returns
    -------
    (X, y)
        The converted `X` and the converted `y` (or `None` if `y` was `None`).

    Raises
    ------
    ValueError
        If `X` and `y` have different lengths, or if their indexes do not match.
    """
    # A Series `y` lends its index to `X`; otherwise fall back to `index`.
    X_alt_index = y.index if isinstance(y, pd.Series) else index
    X = convert_input(X, columns=columns, deep=deep, index=X_alt_index)
    if y is None:
        # Nothing to align against (e.g. transforming unseen data).
        return X, y

    y = convert_input_vector(y, index=X.index)
    # N.B.: If either was already pandas, it keeps its index.

    # Check the lengths first: comparing two indexes of different length
    # element-wise raises pandas' own "Lengths must match" error, which would
    # otherwise hide the more helpful message below.
    if X.shape[0] != y.shape[0]:
        raise ValueError("The length of X is %d but length of y is %d." % (X.shape[0], y.shape[0]))
    if (X.index != y.index).any():
        raise ValueError("`X` and `y` both have indexes, but they do not match.")
    return X, y


def convert_input(X, columns=None, deep=False, index=None):
"""
Unite data into a DataFrame.
Objects that do not contain column names take the names from the argument.
Expand All @@ -53,11 +89,11 @@ def convert_input(X, columns=None, deep=False):
if columns is not None and np.size(X,1) != len(columns):
raise ValueError('The count of the column names does not correspond to the count of the columns')
if isinstance(X, list):
X = pd.DataFrame(X, columns=columns, copy=deep) # lists are always copied, but for consistency, we still pass the argument
X = pd.DataFrame(X, columns=columns, copy=deep, index=index) # lists are always copied, but for consistency, we still pass the argument
elif isinstance(X, (np.generic, np.ndarray)):
X = pd.DataFrame(X, columns=columns, copy=deep)
X = pd.DataFrame(X, columns=columns, copy=deep, index=index)
elif isinstance(X, csr_matrix):
X = pd.DataFrame(X.todense(), columns=columns, copy=deep)
X = pd.DataFrame(X.todense(), columns=columns, copy=deep, index=index)
else:
raise ValueError('Unexpected input type: %s' % (str(type(X))))
elif deep:
Expand Down Expand Up @@ -88,8 +124,10 @@ def convert_input_vector(y, index):
elif np.isscalar(y):
return pd.Series([y], name='target', index=index)
elif isinstance(y, list):
if len(y)==0 or (len(y)>0 and not isinstance(y[0], list)): # empty list or a vector
if len(y)==0: # empty list
return pd.Series(y, name='target', index=index, dtype=float)
elif len(y)>0 and not isinstance(y[0], list): # vector
return pd.Series(y, name='target', index=index)
elif len(y)>0 and isinstance(y[0], list) and len(y[0])==1: # single row in a matrix
flatten = lambda y: [item for sublist in y for item in sublist]
return pd.Series(flatten(y), name='target', index=index)
Expand Down
Loading