scikit-learn-contrib · PaulWestenthanner · Oct 29, 2021 · Oct 24, 2021 · Oct 24, 2021 · Oct 24, 2021
diff --git a/category_encoders/cat_boost.py b/category_encoders/cat_boost.py
@@ -130,11 +130,7 @@ def fit(self, X, y, **kwargs):
         """
 
         # unite the input into pandas types
-        X = util.convert_input(X)
-        y = util.convert_input_vector(y, X.index).astype(float)
-
-        if X.shape[0] != y.shape[0]:
-            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
+        X, y = util.convert_inputs(X, y)
 
         self._dim = X.shape[1]
 
@@ -198,18 +194,12 @@ def transform(self, X, y=None, override_return_df=False):
             raise ValueError('Must train encoder before it can be used to transform data.')
 
         # unite the input into pandas types
-        X = util.convert_input(X)
+        X, y = util.convert_inputs(X, y)
 
         # then make sure that it is the right size
         if X.shape[1] != self._dim:
             raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
 
-        # if we are encoding the training data, we have to check the target
-        if y is not None:
-            y = util.convert_input_vector(y, X.index).astype(float)
-            if X.shape[0] != y.shape[0]:
-                raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
-
         if not list(self.cols):
             return X
         X = self._transform(

diff --git a/category_encoders/glmm.py b/category_encoders/glmm.py
@@ -133,12 +133,8 @@ def fit(self, X, y, **kwargs):
         """
 
         # Unite parameters into pandas types
-        X = util.convert_input(X)
-        y = util.convert_input_vector(y, X.index).astype(float)
-
-        # The lengths must be equal
-        if X.shape[0] != y.shape[0]:
-            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
+        X, y = util.convert_inputs(X, y)
+        y = y.astype(float)
 
         self._dim = X.shape[1]
 
@@ -209,24 +205,15 @@ def transform(self, X, y=None, override_return_df=False):
             raise ValueError('Must train encoder before it can be used to transform data.')
 
         # Unite the input into pandas types
-        X = util.convert_input(X)
+        X, y = util.convert_inputs(X, y, deep=True)
 
         # Then make sure that it is the right size
         if X.shape[1] != self._dim:
             raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
 
-        # If we are encoding the training data, we have to check the target
-        if y is not None:
-            y = util.convert_input_vector(y, X.index).astype(float)
-            if X.shape[0] != y.shape[0]:
-                raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
-
         if not list(self.cols):
             return X
 
-        # Do not modify the input argument
-        X = X.copy(deep=True)
-
         X = self.ordinal_encoder.transform(X)
 
         if self.handle_unknown == 'error':

diff --git a/category_encoders/james_stein.py b/category_encoders/james_stein.py
@@ -177,12 +177,7 @@ def fit(self, X, y, **kwargs):
         """
 
         # Unite parameters into pandas types
-        X = util.convert_input(X)
-        y = util.convert_input_vector(y, X.index).astype(float)
-
-        # The lengths must be equal
-        if X.shape[0] != y.shape[0]:
-            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
+        X, y = util.convert_inputs(X, y)
 
         self._dim = X.shape[1]
 
@@ -273,24 +268,15 @@ def transform(self, X, y=None, override_return_df=False):
             raise ValueError('Must train encoder before it can be used to transform data.')
 
         # Unite the input into pandas types
-        X = util.convert_input(X)
+        X, y = util.convert_inputs(X, y, deep=True)
 
         # Then make sure that it is the right size
         if X.shape[1] != self._dim:
             raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
 
-        # If we are encoding the training data, we have to check the target
-        if y is not None:
-            y = util.convert_input_vector(y, X.index).astype(float)
-            if X.shape[0] != y.shape[0]:
-                raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
-
         if not list(self.cols):
             return X
 
-        # Do not modify the input argument
-        X = X.copy(deep=True)
-
         X = self.ordinal_encoder.transform(X)
 
         if self.handle_unknown == 'error':

diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py
@@ -114,11 +114,8 @@ def fit(self, X, y, **kwargs):
         """
 
         # unite the input into pandas types
-        X = util.convert_input(X)
-        y = util.convert_input_vector(y, X.index).astype(float)
-
-        if X.shape[0] != y.shape[0]:
-            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
+        X, y = util.convert_inputs(X, y)
+        y = y.astype(float)
 
         self._dim = X.shape[1]
 
@@ -182,18 +179,14 @@ def transform(self, X, y=None, override_return_df=False):
             raise ValueError('Must train encoder before it can be used to transform data.')
 
         # unite the input into pandas types
-        X = util.convert_input(X)
+        X, y = util.convert_inputs(X, y, deep=True)
+        if y is not None:
+            y = y.astype(float)
 
         # then make sure that it is the right size
         if X.shape[1] != self._dim:
             raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
 
-        # if we are encoding the training data, we have to check the target
-        if y is not None:
-            y = util.convert_input_vector(y, X.index).astype(float)
-            if X.shape[0] != y.shape[0]:
-                raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
-
         if not list(self.cols):
             return X
         X = self.transform_leave_one_out(
@@ -234,12 +227,11 @@ def fit_column_map(self, series, y):
         result = y.groupby(codes).agg(['sum', 'count'])
         return result.rename(return_map)
 
-    def transform_leave_one_out(self, X_in, y, mapping=None):
+    def transform_leave_one_out(self, X, y, mapping=None):
         """
         Leave one out encoding uses a single column of floats to represent the means of the target variables.
         """
 
-        X = X_in.copy(deep=True)
         random_state_ = check_random_state(self.random_state)
 
         for col, colmap in mapping.items():

diff --git a/category_encoders/m_estimate.py b/category_encoders/m_estimate.py
@@ -129,12 +129,7 @@ def fit(self, X, y, **kwargs):
         """
 
         # Unite parameters into pandas types
-        X = util.convert_input(X)
-        y = util.convert_input_vector(y, X.index).astype(float)
-
-        # The lengths must be equal
-        if X.shape[0] != y.shape[0]:
-            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
+        X, y = util.convert_inputs(X, y)
 
         self._dim = X.shape[1]
 
@@ -205,24 +200,15 @@ def transform(self, X, y=None, override_return_df=False):
             raise ValueError('Must train encoder before it can be used to transform data.')
 
         # Unite the input into pandas types
-        X = util.convert_input(X)
+        X, y = util.convert_inputs(X, y, deep=True)
 
         # Then make sure that it is the right size
         if X.shape[1] != self._dim:
             raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
 
-        # If we are encoding the training data, we have to check the target
-        if y is not None:
-            y = util.convert_input_vector(y, X.index).astype(float)
-            if X.shape[0] != y.shape[0]:
-                raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
-
         if not list(self.cols):
             return X
 
-        # Do not modify the input argument
-        X = X.copy(deep=True)
-
         X = self.ordinal_encoder.transform(X)
 
         if self.handle_unknown == 'error':

diff --git a/category_encoders/quantile_encoder.py b/category_encoders/quantile_encoder.py
@@ -122,11 +122,8 @@ def fit(self, X, y, **kwargs):
         """
 
         # unite the input into pandas types
-        X = util.convert_input(X)
-        y = util.convert_input_vector(y, X.index).astype(float)
-
-        if X.shape[0] != y.shape[0]:
-            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
+        X, y = util.convert_inputs(X, y)
+        y = y.astype(float)
 
         self._dim = X.shape[1]
 
@@ -220,7 +217,7 @@ def transform(self, X, y=None, override_return_df=False):
             raise ValueError("Must train encoder before it can be used to transform data.")
 
         # unite the input into pandas types
-        X = util.convert_input(X)
+        X, y = util.convert_inputs(X, y)
 
         # then make sure that it is the right size
         if X.shape[1] != self._dim:
@@ -232,14 +229,6 @@ def transform(self, X, y=None, override_return_df=False):
                 )
             )
 
-        # if we are encoding the training data, we have to check the target
-        if y is not None:
-            y = util.convert_input_vector(y, X.index)
-            if X.shape[0] != y.shape[0]:
-                raise ValueError(
-                    "The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + "."
-                )
-
         if not list(self.cols):
             return X
 
@@ -376,8 +365,7 @@ def __init__(
         self.encoder_list = None
 
     def fit(self, X, y):
-        X = util.convert_input(X)
-        y = util.convert_input_vector(y, X.index).astype(float)
+        X, y = util.convert_inputs(X, y)
 
         if self.cols is None:
             self.cols = util.get_obj_cols(X)
@@ -418,15 +406,7 @@ def fit(self, X, y):
     def transform(self, X, y=None, override_return_df=False):
         if self.encoder_list is None:
             raise ValueError("Must train encoder before it can be used to transform data.")
-        X = util.convert_input(X)
-
-        # if we are encoding the training data, we have to check the target
-        if y is not None:
-            y = util.convert_input_vector(y, X.index)
-            if X.shape[0] != y.shape[0]:
-                raise ValueError(
-                    "The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + "."
-                )
+        X, y = util.convert_inputs(X, y)
 
         orig_cols = X.columns
         transformed_df = X.copy()

diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py
@@ -115,11 +115,7 @@ def fit(self, X, y, **kwargs):
         """
 
         # unite the input into pandas types
-        X = util.convert_input(X)
-        y = util.convert_input_vector(y, X.index)
-
-        if X.shape[0] != y.shape[0]:
-            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
+        X, y = util.convert_inputs(X, y)
 
         self._dim = X.shape[1]
 
@@ -213,18 +209,12 @@ def transform(self, X, y=None, override_return_df=False):
             raise ValueError('Must train encoder before it can be used to transform data.')
 
         # unite the input into pandas types
-        X = util.convert_input(X)
+        X, y = util.convert_inputs(X, y)
 
         # then make sure that it is the right size
         if X.shape[1] != self._dim:
             raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
 
-        # if we are encoding the training data, we have to check the target
-        if y is not None:
-            y = util.convert_input_vector(y, X.index)
-            if X.shape[0] != y.shape[0]:
-                raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
-
         if not list(self.cols):
             return X
 

diff --git a/category_encoders/utils.py b/category_encoders/utils.py
@@ -40,7 +40,43 @@ def is_category(dtype):
     return pd.api.types.is_categorical_dtype(dtype)
 
 
-def convert_input(X, columns=None, deep=False):
+def convert_inputs(X, y, columns=None, index=None, deep=False):
+    """
+    Unite arraylike `X` and vectorlike `y` into a DataFrame and Series.
+
+    If both are pandas types already, raises an error if their indexes do not match.
+    If only one is pandas, both returns share its index; if neither is, `index` (or a default index) is used.
+
+    Parameters
+    ----------
+    X: arraylike
+    y: listlike or None
+        If `None`, only `X` is converted and `None` is returned in its place.
+    columns: listlike
+        Column names for `X` (default pandas names if `None`); ignored if `X` is already a dataframe.
+    index: listlike
+        Index to use if neither `X` nor `y` is a pandas type (default pandas index if `None`).
+    deep: bool
+        Whether to deep-copy `X`.
+
+    Raises
+    ------
+    ValueError
+        If `X` and `y` differ in length, or if their indexes do not match.
+    """
+    X_alt_index = y.index if isinstance(y, pd.Series) else index
+    X = convert_input(X, columns=columns, deep=deep, index=X_alt_index)
+    if y is not None:
+        y = convert_input_vector(y, index=X.index)
+        # Either input that was already pandas keeps its index, so check lengths before comparing indexes elementwise.
+        if X.shape[0] != y.shape[0]:
+            raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
+        if (X.index != y.index).any():
+            raise ValueError("`X` and `y` both have indexes, but they do not match.")
+    return X, y
+
+
+def convert_input(X, columns=None, deep=False, index=None):
     """
     Unite data into a DataFrame.
     Objects that do not contain column names take the names from the argument.
@@ -53,11 +89,11 @@ def convert_input(X, columns=None, deep=False):
             if columns is not None and np.size(X,1) != len(columns):
                 raise ValueError('The count of the column names does not correspond to the count of the columns')
             if isinstance(X, list):
-                X = pd.DataFrame(X, columns=columns, copy=deep)  # lists are always copied, but for consistency, we still pass the argument
+                X = pd.DataFrame(X, columns=columns, copy=deep, index=index)  # lists are always copied, but for consistency, we still pass the argument
             elif isinstance(X, (np.generic, np.ndarray)):
-                X = pd.DataFrame(X, columns=columns, copy=deep)
+                X = pd.DataFrame(X, columns=columns, copy=deep, index=index)
             elif isinstance(X, csr_matrix):
-                X = pd.DataFrame(X.todense(), columns=columns, copy=deep)
+                X = pd.DataFrame(X.todense(), columns=columns, copy=deep, index=index)
             else:
                 raise ValueError('Unexpected input type: %s' % (str(type(X))))
     elif deep:
@@ -88,8 +124,10 @@ def convert_input_vector(y, index):
     elif np.isscalar(y):
         return pd.Series([y], name='target', index=index)
     elif isinstance(y, list):
-        if len(y)==0 or (len(y)>0 and not isinstance(y[0], list)): # empty list or a vector
+        if len(y)==0:  # empty list
             return pd.Series(y, name='target', index=index, dtype=float)
+        elif len(y)>0 and not isinstance(y[0], list):  # vector
+            return pd.Series(y, name='target', index=index)
         elif len(y)>0 and isinstance(y[0], list) and len(y[0])==1: # single row in a matrix
             flatten = lambda y: [item for sublist in y for item in sublist]
             return pd.Series(flatten(y), name='target', index=index)